# <div align="center">CM1</div>


###  1.1 Required Libraries

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
# nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim.downloader as api
from datasets import load_dataset
from scipy import spatial
import math

### 1.2 Load dataset 

- Identified there are large number of null values exist in columns, so in this assignment we have verified that there will be very less data exist if we remove null values.

- Moreover we will focus on text dataset to build a model. Hence there is no good reason to normalise or remove nll values.


**Dataset Overview :**

- Climate Fever dataset uses Fever methodology that consists of 1,535 real-world claims regarding climate-change collected on the internet.

- Each claim is accompanied by five manually annotated evidence ( Evidence 0 to 4 ) sentences retrieved from the  Wikipedia that support, refute or do not give enough information to validate the claim.

**Dataset Feature :**

1) claim_id : An unique claim identifier in datset.
2) claim :  claim text.
3) claim_label :   Overall label assigned to claim (based on evidence majority vote), The label correspond to 0: "refutes", 1: "supports" and 2: "not enough info".
4) evidences : A list of evidences with below fields : 
    1. evidence_id : An unique evidence identifier.
    2. evidence_label : A micro-verdict label, The label correspond to 0: "refutes", 1: "supports" and 2: "not enough info".
    3. article : A title of source article (Wikipedia page).
    4. evidence : An evidence sentence.
    5. entropy : An entropy reflecting uncertainty of evidence_label.
    6. votes : Refers to individual votes.
    
    
**Note :** In each claim there are total 5 evidences avaialbke in dataset ( From evidence 0 to 4).

In [None]:
dataset = load_dataset('climate_fever')
dataset['test']

### 1.3 Assigned claim  and evidence feature values to corpus ( In order to to apply word embedding )

In [None]:
sent_list = list()
for i in range(0, dataset["test"].num_rows):
    claim = dataset["test"][i]["claim"]
    sent_list.append(claim)
   
    for _, data in enumerate(dataset["test"][i]["evidences"]):
        article = data["article"]
        evidence = data["evidence"]
        sent_list.append(article)
        sent_list.append(evidence)

### 1.4 Text/Data Preprocessing :

- NLTK :
    - We used nltk.corpus package and stopwords library to remove stopwords from corpus.
- PorterStemmer :
    - Stemming is the process of producing morphological variants of a root/base word.
    - Stemming programs are commonly referred to as stemming algorithms or stemmers. A stemming algorithm reduces the words “chocolates”, “chocolatey”, “choco” to the root word, “chocolate”.
    
**Errors in Stemming :**
There are mainly two errors in stemming – **Overstemming** and **Understemming**. 
- Overstemming occurs when two words are stemmed to same root that are of different stems. 
- Under-stemming occurs when two words are stemmed to same root that are not of different stems.


In [31]:

claim_corpus = []

for i in range (0,1534):
    sentences = re.sub('[^a-zA-z]',' ', corpus[i])
    sentences = sentences.lower()
    sentences = sentences.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    sentences = [ps.stem(word) for word in sentences if not word in set(all_stopwords)]
    sentences = ' '.join(sentences)
    claim_corpus.append(sentences)
    

### 1.5 Converted every word of the corpus to embedding vectors in order to embed the text dataset with Word2Vec

In [32]:
corpusVec = [nltk.word_tokenize(sentences) for sentences in claim_corpus]

### 1.6 Build a word2Vec model

- Word2Vec model maps words to real number vectors, at the same time capturing something about the meaning of the text. It says that if two words have similar meaning they will lie close to each other in the dense space. 
- Word2Vec model contains two models for training Skip-Gram model and continuous bag of words(CBOW).

**Parameters :**
- min_count (int) – Ignores all words with total frequency lower than min_count value
- size (int) – Dimensionality of the feature vectors.
- window (int) – The maximum distance between the current and predicted word within a sentence.
- seed (int) – Seed for the random number generator. Initial vectors for each word are seeded with a hash of the concatenation of word + str(seed).

**Note :**
- We have build a Word2vec model on entire dataset and then after we have split word embeddings into train and test set.

In [61]:
from gensim.models import Word2Vec
model = Word2Vec(corpusVec,  min_count =1)
model.save("word2vec.bin")

### 1.7 Split word embeddings into train and test sets

- Train set (corpusVec_train) contains 80% of word embeddings.
- Test set (corpusVec_test) contains 20% of word embeddings.

In [34]:
from sklearn.model_selection import train_test_split
corpusVec_train, corpusVec_test = train_test_split(corpusVec,test_size=0.20,random_state = None)

#### 1.7.1 Created dictionaries to store word and it's embedding

- 2 dictionaries are created ( 1 for train set and another for test set)
- As a window_size in Word2Vec model is 100 (default value) for each word there will be 100 numbers attached to it.

In [35]:
embedding_train = dict()
embedding_test = dict()


for sentence in corpusVec_train:
    for word in sentence:
        embedding_train[word] = model.wv[word]
        
for sentence in corpusVec_test:
    for word in sentence:
        embedding_test[word] = model.wv[word]

In [36]:
len(embedding_train.keys())

2871

In [37]:
len(embedding_test)

1526

#### 1.7.2 Created train and test dataframe from embeddings

In [38]:
train_emb_df = pd.DataFrame.from_dict(embedding_train)
test_emb_df = pd.DataFrame.from_dict(embedding_test)

In [39]:
train_emb_df.head()

Unnamed: 0,earli,septemb,okeechobe,hurrican,made,landfal,near,west,palm,beach,...,till,coldest,themyscira,amazon,attack,creator,splice,sic,caet,unab
0,0.013419,0.008864,0.00545,0.049957,0.028665,0.00371,0.039027,0.032782,0.014662,0.006397,...,0.001144,-0.000966,0.002964,0.009367,-0.000677,0.004301,-0.00203,0.001605,-0.000997,0.004693
1,-0.037344,-0.01336,0.00098,-0.10748,-0.062355,-0.013175,-0.075213,-0.064591,-0.027819,-0.007026,...,0.000229,-0.004554,-0.004754,-0.014114,-0.006255,-0.006049,-0.003033,-0.000787,-0.00845,8.4e-05
2,-0.023527,-0.008161,0.002991,-0.058274,-0.032705,-0.007996,-0.038736,-0.040957,-0.019946,-0.001076,...,-0.0014,-0.00521,-0.003454,-0.004284,-0.001268,-0.004208,0.001467,0.002422,-0.003174,0.001529
3,-0.025865,-0.016536,-0.000808,-0.089662,-0.048211,-0.009439,-0.064055,-0.053204,-0.026004,-0.004001,...,-0.005228,-0.001169,-0.007759,-0.008618,-0.006645,-0.005677,-0.006545,-0.007614,-0.007933,-0.002934
4,-0.01068,-0.004536,-5.6e-05,-0.046348,-0.032381,-0.002989,-0.033829,-0.025641,-0.010468,-0.001888,...,-0.005748,-0.004302,-0.000989,-0.011115,-0.00674,-0.004463,0.000131,0.003083,-0.006569,-0.000615


In [59]:
print(model)

Word2Vec(vocab=3232, size=100, alpha=0.025)


#### 1.7.3 Features of Word2Vec :

- After the model is trained, it is accessible via the “wv” attribute..This is the actual word vector model in which queries can be made.
- We can also save and load out Word2Vec model using "model.save("word2vec.bin")" and "model = Word2Vec.load('model.bin')" respectively.
- We can print the learned vocabulary of tokens (words) as follows:

In [58]:
words = model.wv.vocab
# print(words)


### 1.8 Cosine Similarity Function ( In order to calulate text similarity ): 

- Among different distance metrics, cosine similarity is more intuitive and most used in word2vec. It is normalized dot product of 2 vectors and this ratio defines the angle between them.
- Cosine similiarity between two vectors (A and B) can be calculated using dot(A, B)/(norm(A)*norm(B)). Here norm(A) and norm(B) indicates euclidean norm of vectors respectively.

**Note :** In order to build a function for cosine similarity we used scipy.spatial.distance.cosine.

In [42]:
def cos_similarity(v1, v2):
    return abs(1 - spatial.distance.cosine(v1,v2))

#### 1.8.1 Cosine Similarity Function Examples

In [43]:
ex1 = cos_similarity(embedding_train["slower"], embedding_train["global"])
ex2 = cos_similarity(embedding_train["high"], embedding_train["low"])
ex3 = cos_similarity(embedding_train["peopl"], embedding_train["human"])
ex4 = cos_similarity(embedding_train["australia"], embedding_train["warm"])
ex5 = cos_similarity(embedding_train["scientists"], embedding_train["guidelin"])
print(f"Cosine Similarity between 'slower'and 'global' is : {ex1}")
print(f"Cosine Similarity between 'high'and 'low' is : {ex2}")
print(f"Cosine Similarity between 'peopl'and 'human' is : {ex3}")
print(f"Cosine Similarity between 'australia'and 'warm' is : {ex4}")
print(f"Cosine Similarity between 'scientists'and 'guidelin' is : {ex5}")

Cosine Similarity between 'slower'and 'global' is : 0.9800435900688171
Cosine Similarity between 'high'and 'low' is : 0.9986007809638977
Cosine Similarity between 'peopl'and 'human' is : 0.9858113527297974
Cosine Similarity between 'australia'and 'warm' is : 0.9992226958274841
Cosine Similarity between 'scientists'and 'guidelin' is : 0.47627413272857666


#### 1.8.2 Analysis of Cosine Similarity 

- Cosine Similarity measured by the cosine of **the angle between two vectors** and determines whether two vectors are pointing in roughly the same direction or not.
- This similarity score ranges from 0 to 1, with 0 being the lowest (the least similar) and 1 being the highest (the most similar).
-  A cosine value of 0 means that the two vectors are at 90 degrees to each other (orthogonal) and have no match. 
- The closer the cosine value to 1, the smaller the angle and the greater the match between vectors.
- From above examples and embedded vector for a specific token ( model.wv[word] ) it is cleared that **more difference** between embedded vector for a specific token then lower value for cosine simillarity. ( for example scientists and guidelin)

### 1.9 Airthmetic computations on embedding vectors

- After successfully build a Word2Vec model, we can do create a little linear algebra arithmetic with words.
- Gensim provides an interface for performing these types of operations in the most_similar() function on the trained or loaded model.


Example 1 : **ocean - water + arctic**

In [69]:
model.wv.most_similar(positive=['ocean', 'arctic'], negative=['water'])

[('anthropogen', 0.9971425533294678),
 ('chang', 0.9971103072166443),
 ('per', 0.9970500469207764),
 ('like', 0.9970331788063049),
 ('also', 0.9970329999923706),
 ('year', 0.9970141649246216),
 ('result', 0.9969948530197144),
 ('report', 0.996990442276001),
 ('global', 0.9969826936721802),
 ('sinc', 0.9969472885131836)]

Example 2 :  **carbon - dioxide + oxygen** 

In [78]:
model.wv.most_similar(positive=['carbon', 'oxygen'], negative=['dioxide'])

[('train', 0.5851308703422546),
 ('neutral', 0.5764247179031372),
 ('heart', 0.5706777572631836),
 ('usd', 0.5703952312469482),
 ('corp', 0.5674421191215515),
 ('microbi', 0.565788984298706),
 ('context', 0.5622187852859497),
 ('threaten', 0.5617069005966187),
 ('noth', 0.5592998266220093),
 ('guidelin', 0.5588611364364624)]

Example 3: **'heat' - 'rise' + 'cold'**

In [66]:
model.wv.most_similar(positive=['heat', 'cold', 'warm'], negative=['rise'])
#model.wv.most_similar('heat')

[('model', 0.9943777322769165),
 ('food', 0.9943655729293823),
 ('recent', 0.994345486164093),
 ('continu', 0.9943419694900513),
 ('weather', 0.9942892789840698),
 ('research', 0.994236946105957),
 ('high', 0.9942196607589722),
 ('scientist', 0.994118332862854),
 ('rate', 0.9941150546073914),
 ('report', 0.9941065311431885)]

Example 4 : **'high' - 'temperatur' + 'heat'**

In [64]:
model.wv.most_similar(positive=['high', 'heat'], negative=['temperatur'])

[('event', 0.9989520907402039),
 ('weather', 0.9989404082298279),
 ('record', 0.9988930225372314),
 ('water', 0.9988844990730286),
 ('climat', 0.9988824129104614),
 ('chang', 0.9988715648651123),
 ('report', 0.998858630657196),
 ('use', 0.998855710029602),
 ('cycl', 0.998854398727417),
 ('world', 0.9988488554954529)]

Example 4 : **'climat' + 'chang' + 'temperatur'**

In [None]:
model.wv.most_similar(positive=['climat', 'chang', 'temperatur'])

Example 5 : **'hurrican' - 'storm' + 'wind'**

In [None]:
model.wv.most_similar(positive=['hurrican', 'wind'], negative=['storm'])

In [53]:
### Examples : 
print(model.wv.similarity('high', 'low'))
print(model.wv.similarity('peopl', 'human'))
# model.wv.similarity('upstream', 'downstream')
print(model.wv.similarity('australia', 'warm'))
print(model.wv.similarity('shrink', 'compress'))

0.99860054
0.9858114
0.99922276
0.5082087


In [46]:
model.wv.similarity('heat', 'water')

0.9996577

### 1.10 Loaded pretrained model and performed 

We have loaded below to pretrained model :

1. glove-wiki-gigaword-50 model
2. GoogleNewsvectorsnegative300 Model

In [47]:
import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-50")
#print(api.load('glove-wiki-gigaword-50', return_path=True))

In [79]:
#word_vectors.wv.most_similar(positive=['heat', 'volcano'], negative=['ice'])

word_vectors.most_similar('heat')

[('temperature', 0.7556608319282532),
 ('cold', 0.7549160718917847),
 ('temperatures', 0.7502944469451904),
 ('humidity', 0.7431684732437134),
 ('hot', 0.7388110160827637),
 ('moisture', 0.7275059819221497),
 ('chill', 0.7268701195716858),
 ('water', 0.7164075970649719),
 ('heating', 0.7090611457824707),
 ('add', 0.7034947872161865)]

**Example 1 in Pretrained model :**

In [49]:
word_vectors.most_similar(positive=['ocean', 'arctic'], negative=['water'])

  X1 = model[model.wv.vocab]


**Example 2 in Pretrained model :**

In [None]:
word_vectors.most_similar(positive=['carbon', 'oxygen'], negative=['dioxide'])

**Example 3 in Pretrained model :**

In [None]:
word_vectors.most_similar(positive=['heat', 'cold', 'warm'], negative=['rise'])

**Example 4 in Pretrained model :**

In [None]:
word_vectors.most_similar(positive=['climat', 'chang', 'temperatur'])

**Example 5 in Pretrained model :**

In [None]:
word_vectors.most_similar(positive=['hurrican', 'wind'], negative=['storm'])

In [92]:
from gensim import models
import gensim.downloader as api
#word_vectors2 = api.load("GoogleNews-vectors-negative300.bin.gz")

pr_model2 = api.load("glove-twitter-25") 





In [93]:
pr_model2.most_similar('heat')

[('thunder', 0.9267944097518921),
 ('bulls', 0.911600649356842),
 ('ball', 0.9085021615028381),
 ('beat', 0.9076730012893677),
 ('playoffs', 0.8956629037857056),
 ('lakers', 0.8936164379119873),
 ('basketball', 0.8829619884490967),
 ('cowboys', 0.8791283965110779),
 ('baseball', 0.8777887225151062),
 ('celtics', 0.8775449991226196)]

In [1]:

from gensim.models import Word2Vec
import gensim
# load the Stanford GloVe model
#filename = 'GoogleNews-vectors-negative300.bin'
#trained_model = KeyedVectors.load_word2vec_format(filename, binary=False)
m1 = gensim.models.KeyedVectors.load_word2vec_format('GoogleNewsvectorsnegative300.bin', binary=True)

In [None]:
m1.most_similar('heat')

In [None]:
water = model.wv["year"]
ice = model.wv["ice"]
similarity = cosine_similarity(water, ice)
print(f"The Cosine Similarity between 'water' and 'ice': {similarity:}.")

### Cosine Similarity Function 

In [None]:
cos_similarity(embedding_train["slower"], embedding_train["global"])

In [None]:
model.wv.similarity('slower', 'global')