In [49]:
import spacy

In [50]:
nlp = spacy.load('en_core_web_lg')

In [51]:
doc = nlp("dog cat banana kem")

# For each token, report whether the model holds a word vector for it and
# whether the token is flagged as out-of-vocabulary (OOV).
# NOTE(review): the corpus used to build en_core_web_lg's vectors is not shown
# in this notebook -- confirm against the spaCy model card before citing it.

for tok in doc:
    print(f'{tok.text} Vector: {tok.has_vector} OOV: {tok.is_oov}')

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
kem Vector: False OOV: True


In [54]:
doc

dog cat banana kem

In [53]:
type(doc)

spacy.tokens.doc.Doc

In [55]:
doc[0]

dog

In [7]:
doc[0].vector

array([ 1.2330e+00,  4.2963e+00, -7.9738e+00, -1.0121e+01,  1.8207e+00,
        1.4098e+00, -4.5180e+00, -5.2261e+00, -2.9157e-01,  9.5234e-01,
        6.9880e+00,  5.0637e+00, -5.5726e-03,  3.3395e+00,  6.4596e+00,
       -6.3742e+00,  3.9045e-02, -3.9855e+00,  1.2085e+00, -1.3186e+00,
       -4.8886e+00,  3.7066e+00, -2.8281e+00, -3.5447e+00,  7.6888e-01,
        1.5016e+00, -4.3632e+00,  8.6480e+00, -5.9286e+00, -1.3055e+00,
        8.3870e-01,  9.0137e-01, -1.7843e+00, -1.0148e+00,  2.7300e+00,
       -6.9039e+00,  8.0413e-01,  7.4880e+00,  6.1078e+00, -4.2130e+00,
       -1.5384e-01, -5.4995e+00,  1.0896e+01,  3.9278e+00, -1.3601e-01,
        7.7732e-02,  3.2218e+00, -5.8777e+00,  6.1359e-01, -2.4287e+00,
        6.2820e+00,  1.3461e+01,  4.3236e+00,  2.4266e+00, -2.6512e+00,
        1.1577e+00,  5.0848e+00, -1.7058e+00,  3.3824e+00,  3.2850e+00,
        1.0969e+00, -8.3711e+00, -1.5554e+00,  2.0296e+00, -2.6796e+00,
       -6.9195e+00, -2.3386e+00, -1.9916e+00, -3.0450e+00,  2.48

In [9]:
doc[0].vector.shape

(300,)

In [10]:
base_token = nlp('bread')
base_token[0].vector.shape

(300,)

In [11]:
doc = nlp("bread sandwich burger car tiger human wheat")

# Similarity is computed from the word vectors, so words that occur in similar
# contexts in the text the vectors were built from score higher. Both the
# vectors and the resulting similarity values therefore depend on that corpus.
# NOTE(review): the exact corpus behind en_core_web_lg is not shown here --
# confirm against the spaCy model card.

reference = base_token[0]  # the single 'bread' token from the previous cell
for tok in doc:
    print(f'{tok.text} <-> {base_token.text}:', tok.similarity(reference))

bread <-> bread: 1.0
sandwich <-> bread: 0.6341067552566528
burger <-> bread: 0.4752069115638733
car <-> bread: 0.06451533734798431
tiger <-> bread: 0.047646112740039825
human <-> bread: 0.21511542797088623
wheat <-> bread: 0.615036129951477


In [12]:
def print_similarity(base_word, words_to_compare):
    """Print the similarity of each token in ``words_to_compare`` to ``base_word``.

    Both strings are run through the module-level ``nlp`` pipeline. Every token
    of ``words_to_compare`` is compared against the whole processed
    ``base_word`` document, and one line per token is printed in the form
    ``<token> <-> <base text>: <score>``.
    """
    base_doc = nlp(base_word)
    for tok in nlp(words_to_compare):
        print(f'{tok.text} <-> {base_doc.text}:', tok.similarity(base_doc))

In [13]:
print_similarity("iphone", "apple samsung iphone dog kitten")

apple <-> iphone: 0.4387907748060368
samsung <-> iphone: 0.6708590303423401
iphone <-> iphone: 0.9999999983096304
dog <-> iphone: 0.08211864228011527
kitten <-> iphone: 0.10222317834969896


In [15]:
# Look up the vocabulary vectors for the four words of the classic analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ('king', 'man', 'woman', 'queen')
)

# king - man + woman: compared against the 'queen' vector in the next cell.
result = king - man + woman

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([result], [queen])

array([[0.6178014]], dtype=float32)

## Fake news classifier

In [57]:
import pandas as pd

import os
from pathlib import Path

# Directory holding the dataset. It defaults to the original author's location
# but can be overridden with the NEWS_DATA_DIR environment variable, so the
# notebook runs on other machines without editing this cell.
DATA_DIR = Path(os.environ.get('NEWS_DATA_DIR', 'C:\\Users\\User\\Desktop\\Datasets'))

df = pd.read_csv(DATA_DIR / 'fake_real_news.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [18]:
# Keep only the article body and its label. Selecting by name (rather than by
# position) makes this cell safe to re-run: a positional slice would drop more
# columns on every execution. The explicit copy avoids assigning into a slice
# of the original frame when later cells add or overwrite columns.
df = df[['text', 'label']].copy()

In [19]:
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [20]:
df.shape

(6335, 2)

In [21]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer codes; the outputs below show
# FAKE -> 0 and REAL -> 1.
encoder = LabelEncoder()
encoder.fit(df['label'])
df['label'] = encoder.transform(df['label'])

In [23]:
df['label'].value_counts()

label
1    3171
0    3164
Name: count, dtype: int64

In [24]:
import spacy
# NOTE(review): the same 'en_core_web_lg' model is already loaded near the top
# of the notebook; this reload looks redundant unless this section is meant to
# be runnable on its own -- confirm before removing.
nlp = spacy.load('en_core_web_lg')

In [25]:
doc = nlp('Daniel Greenfield, a Shillman Journalism')
doc.vector.shape

(300,)

In [26]:
# Build one document vector per article. Streaming the texts through nlp.pipe
# processes them in batches, which is spaCy's recommended way to handle many
# texts and is faster than calling nlp() once per row. Wrapping the result in
# a Series with df's index keeps each vector aligned with its source row.
df['vector'] = pd.Series(
    [doc.vector for doc in nlp.pipe(df['text'])],
    index=df.index,
)

In [27]:
df.head()

Unnamed: 0,text,label,vector
0,"Daniel Greenfield, a Shillman Journalism Fello...",0,"[-1.3751823, 1.3421849, -2.3666484, 0.12908486..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,"[-1.7449774, 0.93961924, -2.024867, 0.42536643..."
2,U.S. Secretary of State John F. Kerry said Mon...,1,"[-1.9426425, 1.0062195, -1.9992222, 0.20469022..."
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,"[-1.9125352, -0.1481846, -1.1432766, 0.6861217..."
4,It's primary day in New York and front-runners...,1,"[-1.8516092, 1.3163909, -2.1726575, 1.2286776,..."


In [30]:
df['vector'][0]

array([-1.3751823 ,  1.3421849 , -2.3666484 ,  0.12908486,  4.270479  ,
        0.51925766,  0.44712344,  4.256378  , -0.258568  , -0.69181013,
        5.816825  ,  0.912634  , -2.8267076 ,  0.6135212 ,  0.9216893 ,
        1.8403614 ,  1.0790504 , -0.62502784, -1.0741416 , -1.1811537 ,
        1.187674  , -0.7152485 , -0.54194546, -1.0991569 , -0.6336021 ,
       -1.6232888 , -2.2061875 , -1.0777613 , -0.76670575,  1.1433407 ,
        0.9485234 , -0.25826022, -0.6409617 , -1.3894422 , -2.7554905 ,
       -0.47828907, -0.7560415 ,  1.2054269 ,  1.877219  ,  1.2506902 ,
       -0.33462027,  0.50827944,  0.07255519, -0.06292999, -1.8812821 ,
        0.89956325,  1.0648625 , -2.9592528 , -1.0185561 ,  2.09682   ,
       -0.8460555 ,  1.1176676 ,  0.743881  , -4.517312  , -0.43132237,
       -0.11841195,  0.63771945,  0.93854254,  0.8386211 ,  0.17864674,
        0.76933694, -0.59235567, -0.08093308, -1.4929782 ,  1.5234507 ,
        1.3508835 , -2.7877965 , -3.2130456 ,  0.67457646,  2.27

In [31]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the same rows land in each partition on
# every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].values,
    df['label'],
    test_size=0.2,
    random_state=0,
)

In [32]:
X_train.shape

(5068,)

In [33]:
X_test.shape

(1267,)

In [34]:
# X_train is a numpy array of numpy arrays, but to give it as input to a classifier we need to convert it
# to a 2-D numpy array. We can do it using np.stack function
X_train

array([array([-1.61986899e+00,  1.33421493e+00, -1.35586178e+00,  5.18004298e-01,
               4.69462872e+00,  1.36738017e-01,  1.77956045e-01,  3.30067706e+00,
              -1.79211095e-01, -8.72039139e-01,  5.55393505e+00,  9.98799205e-01,
              -2.81162977e+00,  4.59265649e-01,  6.82270646e-01,  2.03440595e+00,
               1.30965984e+00, -1.18151136e-01, -5.97718835e-01, -4.81164217e-01,
               9.93831575e-01, -7.37450004e-01, -9.26415324e-01, -3.63480955e-01,
               3.26350808e-01, -1.26044559e+00, -1.99598646e+00, -8.91666174e-01,
              -3.93535569e-02,  7.41871715e-01,  9.48715732e-02, -3.28487545e-01,
              -2.35230520e-01, -8.54804754e-01, -2.83622646e+00, -1.14813268e+00,
              -6.17452562e-01,  9.27654862e-01,  4.78723019e-01,  3.60162705e-01,
              -1.02181554e-01,  7.40911588e-02, -3.05617571e-01,  4.16660458e-01,
              -1.28456891e+00,  7.27809429e-01, -1.58052981e-01, -2.12075996e+00,
              -7

In [35]:
# X_train is a numpy array of numpy arrays, but to give it as input to a classifier we need to convert it
# to a 2-D numpy array. We can do it using np.stack function

import numpy as np

# Stack the per-article vectors row-wise so each split becomes a single
# 2-D array.
X_train_2d, X_test_2d = (np.stack(part) for part in (X_train, X_test))

In [36]:
X_train_2d

array([[-1.619869  ,  1.3342149 , -1.3558618 , ..., -0.04046927,
        -2.761496  ,  0.7598182 ],
       [-2.3222246 ,  0.7897645 , -1.8041078 , ..., -0.9577217 ,
        -2.373393  ,  0.5841433 ],
       [-1.2018781 ,  1.4973024 , -2.5473328 , ..., -0.92211866,
        -2.9232326 ,  1.4042891 ],
       ...,
       [-1.7931817 ,  1.1534967 , -2.06488   , ..., -0.45617864,
        -2.8729343 ,  0.8529313 ],
       [-2.864977  ,  0.01813644, -1.0462892 , ..., -2.8005648 ,
        -0.906828  ,  0.14209157],
       [-2.444136  ,  0.47068468, -2.1141195 , ..., -0.84001374,
        -2.1663854 ,  0.4640902 ]], dtype=float32)

In [37]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

# Fitting on the raw embeddings is expected to fail because they contain
# negative values. Catch and show the error so the notebook still runs
# top-to-bottom; `clf` is fitted on scaled data in a later cell.
try:
    clf.fit(X_train_2d, y_train)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: Negative values in data passed to MultinomialNB (input X)

In [38]:
# MultinomialNB doesn't accept negative values as input so lets use a scaler

from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same rescaling to both splits so no test-set statistics leak into training.
scaler = MinMaxScaler().fit(X_train_2d)
scaled_train_embed = scaler.transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

In [39]:
clf.fit(scaled_train_embed, y_train)

In [40]:
y_pred = clf.predict(scaled_test_embed)

In [42]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.70      0.74       615
           1       0.74      0.83      0.79       652

    accuracy                           0.77      1267
   macro avg       0.77      0.77      0.77      1267
weighted avg       0.77      0.77      0.77      1267



In [45]:
from sklearn.neighbors import KNeighborsClassifier

clf1 = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')
clf1.fit(scaled_train_embed,y_train)

In [46]:
y_pred1 = clf1.predict(scaled_test_embed)

print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

           0       0.87      0.71      0.78       615
           1       0.77      0.90      0.83       652

    accuracy                           0.81      1267
   macro avg       0.82      0.80      0.80      1267
weighted avg       0.82      0.81      0.80      1267



In [48]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest (same seed as the train/test split) so the reported scores
# are reproducible across runs. An unseeded forest gives slightly different
# numbers every time, so the figures may shift marginally from an earlier
# unseeded run.
clf2 = RandomForestClassifier(random_state=0)
clf2.fit(scaled_train_embed, y_train)

y_pred2 = clf2.predict(scaled_test_embed)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       615
           1       0.86      0.85      0.86       652

    accuracy                           0.85      1267
   macro avg       0.85      0.85      0.85      1267
weighted avg       0.85      0.85      0.85      1267



### Key Takeaways
- The KNN model, which did not perform well with vectorization techniques such as Bag of Words and TF-IDF because of their very high-dimensional vector spaces, performed well (0.81 accuracy) with the pretrained word vectors from `en_core_web_lg`: the vectors have only 300 dimensions, and similar or related words have nearby embeddings, which suits a distance-based model on this text data.

- The MultinomialNB model performed decently (0.77 accuracy) but was the weakest of the three models, because the 300-dimensional vectors contain negative values and MultinomialNB raises an error when it is fitted on data with negative values. To work around this, we used a Min-Max scaler to rescale every feature to the range 0 to 1. This rescaling can distort the variance of the features and lose some information, but we still obtained decent recall and F1 scores.