In [1]:
import pickle
from collections import Counter

import nltk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from gensim.models import Word2Vec

In [2]:
# Load the cleaned tweets DataFrame (presumably the test split, judging by the file name).
# NOTE: read_pickle unpickles the file, which can execute arbitrary code; only load trusted files.
df = pd.read_pickle('../data/cleaned_tweets_test.pkl')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,text,label,category,cleaned,num_words
0,we are living in scary times in canada. gov’t ...,0,unreliable,living scary time canada gov refuse protect ca...,9
1,"just as bad in canada. in fact, our government...",0,unreliable,bad canada fact government accusing us racism ...,10
2,it was only a matter of time before the mainst...,0,unreliable,matter time mainstream medium decided blame co...,10
3,russia's taking no chances: foreigners infecte...,0,unreliable,russia taking chance foreigner infected new ch...,13
4,although there is now a presumptive confirmed ...,0,unreliable,although presumptive confirmed case wuhan nove...,11


In [4]:
# Class balance by numeric label.
df['label'].value_counts()

0    280
1    277
Name: label, dtype: int64

In [5]:
# Class balance by category name.
df['category'].value_counts()

unreliable    280
true          277
Name: category, dtype: int64

#### Count of (non-distinct) words in each category

In [6]:
# Flatten the whitespace-tokenised 'true' tweets into one list of word tokens (repeats kept).
bow_true = [
    token
    for tokens in df.loc[df.category == 'true', 'cleaned'].str.split().to_list()
    for token in tokens
]


In [7]:
# Flatten the whitespace-tokenised 'unreliable' tweets into one list of word tokens (repeats kept).
bow_unreliable = [
    token
    for tokens in df.loc[df.category == 'unreliable', 'cleaned'].str.split().to_list()
    for token in tokens
]

In [8]:
# Total number of word tokens (duplicates included) in the 'true' tweets.
len(bow_true)

2829

In [9]:
# Total number of word tokens (duplicates included) in the 'unreliable' tweets.
len(bow_unreliable)

3294

In [10]:
# Number of distinct words in the 'true' tweets.
len(set(bow_true))

1089

In [11]:
# Number of distinct words in the 'unreliable' tweets.
len(set(bow_unreliable))

1583

In [12]:
# All word tokens from both categories, 'true' tokens first.
words = [*bow_true, *bow_unreliable]

In [13]:
# Pair every cleaned tweet with its category, in DataFrame row order.
texts = list(zip(df.cleaned.to_list(), df.category.to_list()))

In [14]:
# Unzip the (tweet, category) pairs into two parallel tuples.
tweets, cats = zip(*texts)

## Word Embeddings

In [15]:
# Distinct vocabulary of this dataset.
testing_words = {*words}

In [16]:
# Load the two pickled word-embedding objects (presumably CBOW and skip-gram, per the file names).
# NOTE: pickle.load can execute arbitrary code; only unpickle trusted project artifacts.
# The context managers close each file handle as soon as loading finishes.
with open('../data/we_cbow_training_unshared.pickle', 'rb') as cbow_file:
    we_cbow = pickle.load(cbow_file)
with open('../data/we_sg_training_unshared.pickle', 'rb') as sg_file:
    we_sg = pickle.load(sg_file)

In [17]:
# Load the pickled training vocabulary (presumably a set of words, per the file name).
# NOTE: pickle.load can execute arbitrary code; only unpickle trusted project artifacts.
# The context manager closes the file handle as soon as loading finishes.
with open('../data/training_words_set.pickle', 'rb') as vocab_file:
    training_words = pickle.load(vocab_file)

In [18]:
# Count the words of this dataset that are absent from the training vocabulary.
# Such words are skipped when tweet embeddings are averaged, which might be a significant issue.
sum(1 for word in testing_words if word not in training_words)

596

In [19]:
def meaner(word_embedding, tweets, training_words):
    """Average the word vectors of each tweet into one tweet-level embedding.

    Parameters
    ----------
    word_embedding : mapping-like
        Object indexable by word (``word_embedding[word]``) that yields a
        numeric vector.
    tweets : iterable of str
        Tweet texts; each one is tokenised with ``str.split()``.
    training_words : container of str
        Only words found in this container contribute to the mean; all other
        words are ignored.

    Returns
    -------
    dict
        Maps each tweet text to the element-wise mean of its in-vocabulary
        word vectors, in first-seen order. Because the dict is keyed by the
        tweet text, identical tweets share a single entry, so the result can
        be shorter than ``tweets``. A tweet without any in-vocabulary word
        maps to an all-NaN vector of the same shape as the other embeddings
        (or to a scalar NaN when no tweet at all has an in-vocabulary word).
    """
    tweet_embedding = {}
    unmatched = []       # tweets for which no word vector could be looked up
    vector_shape = None  # shape of a computed mean vector, once one is known
    for tweet in tweets:
        vectors = [word_embedding[word] for word in tweet.split() if word in training_words]
        if vectors:
            mean_vector = np.mean(np.array(vectors), axis=0)
            vector_shape = mean_vector.shape
            tweet_embedding[tweet] = mean_vector
        else:
            # Reserve the slot now so insertion order is preserved; averaging an
            # empty array would emit a RuntimeWarning and give a scalar NaN.
            tweet_embedding[tweet] = np.nan
            unmatched.append(tweet)
    if vector_shape is not None:
        # Give unmatched tweets a NaN vector shaped like every other embedding.
        for tweet in unmatched:
            tweet_embedding[tweet] = np.full(vector_shape, np.nan)
    return tweet_embedding

In [20]:
# Mean-pool the word vectors of every tweet, once per loaded word-embedding object.
te_cbow = meaner(word_embedding=we_cbow, tweets=tweets, training_words=training_words)
te_sg = meaner(word_embedding=we_sg, tweets=tweets, training_words=training_words)

In [21]:
# Number of entries in each tweet-embedding dict; the dicts are keyed by tweet text,
# so a count below the number of tweets would mean duplicate tweet texts were merged.
len(te_cbow), len(te_sg)

(557, 557)

In [22]:
def _embedding_frame(tweet_embedding, categories, seed=42):
    """Turn a ``{tweet: vector}`` dict into a shuffled, labelled DataFrame.

    Parameters
    ----------
    tweet_embedding : dict
        Maps tweet text to its embedding vector; one output row per key.
    categories : sequence of str
        Category of each tweet, in the same order as the dict's keys.
    seed : int, default 42
        ``random_state`` used for the row shuffle.

    Returns
    -------
    pandas.DataFrame
        Columns: ``tweet``, one column per embedding dimension, ``category``
        and ``label`` (0 for 'unreliable', 1 for 'true'), with rows shuffled
        and the index reset.

    Raises
    ------
    ValueError
        If the number of embedded tweets differs from ``len(categories)``.
    """
    frame = (
        pd.DataFrame(tweet_embedding)
        .T
        .reset_index()
        .rename(columns={'index': 'tweet'})
    )
    # Categories are attached by position, so the lengths must agree; the dict is
    # keyed by tweet text, so duplicate tweets would make it shorter.
    if len(frame) != len(categories):
        raise ValueError(
            f"{len(frame)} embedded tweets but {len(categories)} categories; "
            "positional category assignment would be misaligned"
        )
    frame['category'] = categories
    frame['label'] = frame.category.map({'unreliable':0, 'true':1})
    return frame.sample(frac=1, random_state=seed).reset_index(drop=True)


te_cbow_df = _embedding_frame(te_cbow, cats)
te_sg_df = _embedding_frame(te_sg, cats)

In [23]:
# Preview the first five rows of the shuffled CBOW tweet-embedding frame.
te_cbow_df.head(n=5)

Unnamed: 0,tweet,0,1,2,3,4,5,6,7,8,...,292,293,294,295,296,297,298,299,category,label
0,nation health professional continue manage cor...,0.02661,-0.102835,-0.025486,-0.055305,-0.11843,-0.015232,0.373783,0.09982,-0.092851,...,0.048518,0.085304,0.090412,0.27926,0.164074,0.237289,-0.014596,0.084876,unreliable,0
1,toronto public health set coronavirus hotline ...,0.039418,-0.166046,-0.036451,-0.084716,-0.186868,-0.024942,0.587382,0.15404,-0.150155,...,0.078209,0.133449,0.142628,0.446401,0.260223,0.37386,-0.027158,0.135424,true,1
2,hey trumptrain official warns trump ignorant c...,0.050208,-0.147345,-0.039941,-0.087067,-0.19709,-0.018481,0.610433,0.158824,-0.170516,...,0.083069,0.109545,0.110459,0.427232,0.253073,0.402234,0.000706,0.138181,unreliable,0
3,lie us coronaviruse,0.04077,-0.113757,-0.022953,-0.057281,-0.147395,-0.015885,0.439306,0.089939,-0.129205,...,0.079528,0.056295,0.078036,0.316576,0.179258,0.309162,-0.020263,0.099624,unreliable,0
4,maga hat made china may infected coronavirus s...,0.047375,-0.134999,-0.035648,-0.073456,-0.151717,-0.0326,0.496991,0.105883,-0.123409,...,0.075033,0.090503,0.126417,0.3897,0.208518,0.317893,-0.049227,0.124209,unreliable,0


In [24]:
# Persist both tweet-embedding frames. The context managers flush and close each
# file as soon as the dump finishes instead of leaving open write handles behind.
with open("../data/te_cbow_df_testing_unshared.pickle", "wb") as cbow_out:
    pickle.dump(te_cbow_df, cbow_out)
with open("../data/te_sg_df_testing_unshared.pickle", "wb") as sg_out:
    pickle.dump(te_sg_df, sg_out)