In [1]:
import pandas as pd
import numpy as np
from glob2 import glob
import json

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import re
from nltk.corpus import stopwords as stpdfa
from nltk.stem.porter import PorterStemmer

import gensim
import gensim.models.word2vec as w2v
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from datetime import datetime
from datetime import date
from datetime import timedelta

[nltk_data] Downloading package punkt to /Users/june/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/june/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 400)
# pd.set_option('display.max_rowwidth', 100)
pd.set_option('display.max_colwidth', 400)

# TOC
1. Data Import
    1. Feature Extraction
2. Text Vectorization
    2. Word2Vec
    3. Doc2Vec
4. Data Export as CSV files

# Data Import

## Functions

In [3]:
import random


def cross_val_jsons(jsonFiles, isTest):
    # output = [json for json in jsonFiles]
    # print(random.choice(jsonFiles))
    if (isTest == True):
        test = random.choice(jsonFiles)
        jsonFiles.remove(test)
        train = [json for event in jsonFiles for json in event]
        return train, test
    else:
        data = [json for event in jsonFiles for json in event]
        return [data]

def extract_data(datas):
    data_lists = []
    isRumorLists = []
    for index, dataset in enumerate(datas):
        data_list = []
        isRumorList = []
        count = 0 # help var

        for jsonFile in dataset:
            count+=1
            if jsonFile.find("non-rumours") == -1:
                isRumorList.append(1)
            else:
                isRumorList.append(0)

            with open (jsonFile, 'r') as f:
                for l in f.readlines():
                    if not l.strip (): # skip empty lines
                        continue
                    json_data = json.loads(l)
                    # print (json_data,"\n\n")
                    data_list.append(json_data)

        isRumorLists.append(pd.DataFrame(isRumorList, columns=['isRumor']))
        data_lists.append(data_list)
    return data_lists, isRumorLists

def printRumor(route):
    if route.find("rumours") == -1:
        print('non-rumors')
    else:
        print('rumor')

In [4]:
def capitalratio(tweet_text):
    uppers = [l for l in tweet_text if l.isupper()]
    capitalratio = len(uppers) / len(tweet_text)
    return capitalratio 

def tweets2tokens(tweet_text):
    # Tokenizing
    urls = []
    tokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+','', tweet_text.lower()))
    tweet_text = re.sub(r"http\S+", "", tweet_text)
    # tokens = nltk.TweetTokenizer().tokenize(re.sub(r'([\d,.]+)','', tweet_text.lower()))

    # Setting url value (whether the tweet contains http link) and filter http links
    url=0
    for token in tokens:
        if token.startswith('http'):
            url=1
    tokens = [token for token in tokens if not token.startswith('http')]

    ## Stemming
    # porter = PorterStemmer()
    # tokens = [porter.stem(token) for token in tokens]

    # Filtering Stop words
    # from nltk.corpus import stopwords
    # stop_words = set(stopwords.words('english'))
    # tokens = [token for token in tokens if not token in stop_words]

    return tokens, url

def getposcount(tokens):
    postag = []
    poscount = {}
    poscount['Noun']=0
    poscount['Verb']=0
    poscount['Adjective'] = 0
    poscount['Pronoun']=0
    poscount['FirstPersonPronoun']=0
    poscount['SecondPersonPronoun']=0
    poscount['ThirdPersonPronoun']=0
    poscount['Adverb']=0
    poscount['Numeral']=0
    poscount['Conjunction_inj']=0
    poscount['Particle']=0
    poscount['Determiner']=0
    poscount['Modal']=0
    poscount['Whs']=0
    Nouns = {'NN','NNS','NNP','NNPS'}
    Adverbs = {'RB','RBR','RBS'}
    Whs = {'WDT','WP','WRB'} # Composition of wh-determiner(that,what), wh-pronoun(who), wh-adverb(how)
    Verbs={'VB','VBP','VBZ','VBN','VBG','VBD','To'}
    first_person_pronouns=['i','I','me','my','mine','we','us','our','ours'] #'i',
    second_person_pronouns=['you','your','yours']
    third_person_pronouns=['he','she','it','him','her','it','his','hers','its','they','them','their','theirs']

    for word in tokens:
        w_lower=word.lower()
        if w_lower in first_person_pronouns:
            poscount['FirstPersonPronoun']+=1
        elif w_lower in second_person_pronouns:
            poscount['SecondPersonPronoun']+=1
        elif w_lower in third_person_pronouns:
            poscount['ThirdPersonPronoun']+=1
    
    postag = nltk.pos_tag(tokens)
    for g1 in postag:
        if g1[1] in Nouns:
            poscount['Noun'] += 1
        elif g1[1] in Verbs:
            poscount['Verb']+= 1
        elif g1[1]=='ADJ'or g1[1]=='JJ':
            poscount['Adjective']+=1
        elif g1[1]=='PRP' or g1[1]=='PRON' or g1[1]=='PRP$':
            poscount['Pronoun']+=1
        elif g1[1] in Adverbs or g1[1]=='ADV':
            poscount['Adverb']+=1
        elif g1[1]=='CD':
            poscount['Numeral']+=1
        elif g1[1]=='CC' or g1[1]=='IN':
            poscount['Conjunction_inj']+=1
        elif g1[1]=='RP':
            poscount['Particle']+=1
        elif g1[1]=='MD':
            poscount['Modal']+=1
        elif g1[1]=='DT':
            poscount['Determiner']+=1
        elif g1[1] in Whs:
            poscount['Whs']+=1
    return poscount

def contentlength(words):
    wordcount = len(words)
    return wordcount

def extract_urls(entities_dicts):
    if len(entities_dicts) < 1:
        return 0,[],[]

    urls = []
    urls_expanded = []

    key = 'url'
    key2 = 'expanded_url'
    # print(len(entities_dict))
    for i in entities_dicts:
        urls.append(i[key])
        urls_expanded.append(i[key2])
    return 1, urls, urls_expanded

def flatten_tweets(tweets):
    """ Flattens out tweet dictionaries so relevant JSON is in a top-level dictionary. """
    tweets_list = []
    total_tokens_l = []

    # Iterate through each tweet
    for tweet_obj in tweets:
        output_f = dict()

        output_f['text']= tweet_obj['text']
        
        urls_dicts = tweet_obj['entities']['urls']
        # print(urls_dicts)

        output_f['hasURL'], output_f['urls'], output_f['urls_expanded'] = extract_urls(urls_dicts)
        
        # print(type(tweet_obj['user']))
        # print(tweet_obj['user'].contains_key('entities'))
        if ('url' in tweet_obj['user']):
            output_f['hasUserURL'] = 1
            output_f['user_url'] = tweet_obj['user']['url']
        elif ('entities' in tweet_obj['user']):
            # output_f['user_entity'] = tweet_obj['user']['entities']['url']['urls']
            # print(tweet_obj['user']['entities']['url']['urls'])
            # output_f['user_url'] = tweet_obj['user']['entities']['expanded_url']
            output_f['hasUserURL'] , _ , output_f['user_url'] = extract_urls(tweet_obj['user']['entities']['url']['urls'])
        else:
            # output_f['user_entity'] = None
            output_f['user_url'] = 0
            output_f['hasUserURL'] = 0

        output_f['text_token'], output_f['isNotOnlyText'] = tweets2tokens(tweet_obj['text'])
        total_tokens_l.extend(output_f['text_token']) # append the tokens to list of total tokens

        '''POS Tagging'''
        pos_dict=getposcount(output_f['text_token'])
        output_f.update(pos_dict)

        output_f['char_count'] = len(output_f['text'])
        output_f['word_count'] = len(output_f['text_token'])

        output_f['has_question'] = "?" in output_f["text"]
        output_f['has_exclaim'] = "!" in output_f["text"]
        output_f['has_period'] = "." in output_f["text"]
    
        ''' User info'''
        # Store the user screen name in 'user-screen_name'
        # output_f['user-screen_name'] = tweet_obj['user']['screen_name']
        
        # Store the user location
        # output_f['user-location'] = tweet_obj['user']['location']

        acc_created = datetime.strptime(tweet_obj['user']['created_at'], '%a %b %d %H:%M:%S %z %Y')
        tweet_created = datetime.strptime(tweet_obj['created_at'], '%a %b %d %H:%M:%S %z %Y')
        age = (tweet_created - acc_created)
        # print(type(timedelta.total_seconds(age)))

        output_f['capital_ratio']=(capitalratio(tweet_obj['text']))

        # features=(capitalratio(data_list[0]['user']))
        output_f['tweet_count'] = np.log10(tweet_obj['user']['statuses_count'])
        output_f['listed_count'] = np.log10(tweet_obj['user']['listed_count'])
        output_f['follow_ratio'] = np.log10(tweet_obj['user']['followers_count'])
        output_f['age'] = int(timedelta.total_seconds(age)/86400)
        output_f['verified'] = tweet_obj['user']['verified']

        tweets_list.append(output_f)

    unk_tokens_l = list(set(total_tokens_l))
    print("Number of total tokens appeared: {}\nNumber of unique tokens appeared: {}\n".format(len(total_tokens_l), len(unk_tokens_l))) # number of tokens and unique tokens

    return tweets_list

In [5]:
class data_loader():

    def getdata(self, root=True, reaction=False, split=True):
        lists = []
        if root == True:
            charliehebdo_jsons = glob('../pheme-rnr-dataset/charliehebdo/**/source-tweet/*.json') 
            ferguson_jsons = glob('../pheme-rnr-dataset/ferguson/**/source-tweet/*.json')
            germanwing_scrash_jsons = glob('../pheme-rnr-dataset/germanwings-crash/**/source-tweet/*.json')
            ottawashooting_jsons = glob('../pheme-rnr-dataset/ottawashooting/**/source-tweet/*.json')
            sydneysiege_jsons = glob('../pheme-rnr-dataset/sydneysiege/**/source-tweet/*.json')
            print(len(charliehebdo_jsons),len(ferguson_jsons),len(germanwing_scrash_jsons),len(ottawashooting_jsons),len(sydneysiege_jsons))
            lists.append([charliehebdo_jsons, ferguson_jsons, germanwing_scrash_jsons, ottawashooting_jsons, sydneysiege_jsons])
        elif reaction == True:
            charliehebdo_reaction = glob('../pheme-rnr-dataset/charliehebdo/**/reactions/*.json') 
            ferguson_reaction = glob('../pheme-rnr-dataset/ferguson/**/reactions/*.json')
            germanwing_scrash_reaction = glob('../pheme-rnr-dataset/germanwings-crash/**/reactions/*.json')
            ottawashooting_reaction = glob('../pheme-rnr-dataset/ottawashooting/**/reactions/*.json')
            sydneysiege_reaction = glob('../pheme-rnr-dataset/sydneysiege/**/reactions/*.json')
            lists.append([charliehebdo_reaction, ferguson_reaction, germanwing_scrash_reaction, ottawashooting_reaction, sydneysiege_reaction])
            
        return lists

In [6]:
jsonFiles = data_loader().getdata()


2079 1143 469 890 1221


## Importing JSON Files and grouping

In [6]:
jsonFiles = data_loader().getdata()
data = cross_val_jsons(jsonFiles[0], isTest = True)
# data = cross_val_jsons(jsonFiles[0], isTest = False)

2079 1143 469 890 1221


In [7]:
data_lists, isRumorLists = extract_data(data)

train, test = data_lists[0], data_lists[1]
df_train_y, df_test_y = isRumorLists[0],isRumorLists[1]
print("(Data of Root tweets) Train: {} Test: {}".format(len(train),len(test)))
print("(Data of Root tweets) Train_y: {} Test_y: {}".format(len(isRumorLists[0]),len(isRumorLists[1])))

(Data of Root tweets) Train: 3723 Test: 2079
(Data of Root tweets) Train_y: 3723 Test_y: 2079


In [9]:
pd.DataFrame(np.array(data_lists))

Unnamed: 0,0
0,"[{'contributors': None, 'truncated': False, 'text': 'Michael Brown was suspected of robbing store in #Ferguson before being shot, say police http://t.co/KUZQGasFgA http://t.co/TjuEVB8z5m', 'in_reply_to_status_id': None, 'id': 500311153583853570, 'favorite_count': 88, 'source': '<a href=""http://www.socialflow.com"" rel=""nofollow"">SocialFlow</a>', 'retweeted': False, 'coordinates': None, 'entitie..."
1,"[{'contributors': None, 'truncated': False, 'text': 'BREAKING: Armed man takes hostage in kosher grocery east of Paris http://t.co/PBs3sMwhLt', 'in_reply_to_status_id': None, 'id': 553529101659566080, 'favorite_count': 14, 'source': '<a href=""https://about.twitter.com/products/tweetdeck"" rel=""nofollow"">TweetDeck</a>', 'retweeted': False, 'coordinates': None, 'entities': {'user_mentions': [], '..."


In [None]:
data_lists[0]

In [8]:
jsonFiles = data_loader().getdata()
data = cross_val_jsons(jsonFiles[0], isTest = False)
data_lists, isRumorLists = extract_data(data)

data = data_lists[0]
data_y = isRumorLists[0]
print("(Data of Root tweets) Data: {}".format(len(data)))
print("(Data of Root tweets) Label: {}".format(len(data_y)))

(Data of Root tweets) Data: 5802
(Data of Root tweets) Label: 5802


## Flatten and extract basic features

In [10]:
df_train = pd.DataFrame(flatten_tweets(train))
df_test = pd.DataFrame(flatten_tweets(test))
df_data = pd.DataFrame(flatten_tweets(data))

Number of total tokens appeared: 69679
Number of unique tokens appeared: 6761

Number of total tokens appeared: 18496
Number of unique tokens appeared: 3149

Number of total tokens appeared: 88175
Number of unique tokens appeared: 8154



In [11]:
df_data

Unnamed: 0,text,hasURL,urls,urls_expanded,hasUserURL,user_url,text_token,isNotOnlyText,Noun,Verb,Adjective,Pronoun,FirstPersonPronoun,SecondPersonPronoun,ThirdPersonPronoun,Adverb,Numeral,Conjunction_inj,Particle,Determiner,Modal,Whs,char_count,word_count,has_question,has_exclaim,has_period,capital_ratio,tweet_count,listed_count,follow_ratio,age,verified
0,BREAKING: Armed man takes hostage in kosher grocery east of Paris http://t.co/PBs3sMwhLt,1,[http://t.co/PBs3sMwhLt],[http://htz.li/1lI],1,http://www.haaretz.com,"[breaking, armed, man, takes, hostage, in, kosher, grocery, east, of, paris]",1,6,3,0,0,0,0,0,0,0,2,0,0,0,0,88,11,False,False,True,0.159091,4.803286,3.855943,5.287349,2126,True
1,"#CharlieHebdo killers dead, confirmed by gendarmerie.",0,[],[],1,http://www.agnespoirier.org,"[charliehebdo, killers, dead, confirmed, by, gendarmerie]",0,2,1,2,0,0,0,0,0,0,1,0,0,0,0,53,6,False,False,True,0.037736,3.031812,2.146128,3.672929,1050,False
2,"Top French cartoonists Charb, Cabu, Wolinski, Tignous confirmed among dead in #Paris #CharlieHebdo attack. Editor is critically wounded.",0,[],[],1,http://t.co/EWb7m4orG8,"[top, french, cartoonists, charb, cabu, wolinski, tignous, confirmed, among, dead, in, paris, charliehebdo, attack, editor, is, critically, wounded]",0,3,4,8,0,0,0,0,1,0,2,0,0,0,0,136,18,False,False,True,0.073529,3.856245,2.879669,4.309651,2030,False
3,Police have surrounded the area where the #CharlieHebdo attack suspects are believed to be: http://t.co/3tGXEIX4F2\nhttps://t.co/aBSezf2QWS,1,"[http://t.co/3tGXEIX4F2, https://t.co/aBSezf2QWS]","[http://cnn.it/1xYDHvp, https://amp.twimg.com/v/7d7ecf3e-0965-41ca-afa6-81397b4854db]",1,http://www.cnn.com,"[police, have, surrounded, the, area, where, the, charliehebdo, attack, suspects, are, believed, to, be]",1,5,5,0,0,0,0,0,0,0,0,0,2,0,1,138,14,False,False,True,0.101449,4.735814,5.009820,7.187664,2891,True
4,PHOTO: Armed gunmen face police officers near #CharlieHebdo HQ in Paris http://t.co/3Jsosc7yl3 http://t.co/iOpVNO6Iq0,1,[http://t.co/3Jsosc7yl3],[http://on.rt.com/k5ivya],1,http://t.co/bDDyvy9DmR,"[photo, armed, gunmen, face, police, officers, near, charliehebdo, hq, in, paris]",1,7,2,0,0,0,0,0,0,0,2,0,0,0,0,117,11,False,False,True,0.145299,5.021181,4.132996,5.925434,1975,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,'I'll ride with you' http://t.co/llZnuCAzg5 Australia unites during #SydneySiege http://t.co/WIU22VPgkz,1,[http://t.co/llZnuCAzg5],[http://bbc.in/1DArzWS],1,http://bbc.com/trending,"[ill, ride, with, you, australia, unites, during, sydneysiege]",1,2,2,1,1,0,1,0,0,0,2,0,0,0,0,103,8,False,False,True,0.116505,3.586024,2.876795,4.716070,531,True
5798,Canada's thoughts and prayers are with our Australian friends. #MartinPlace #SydneySiege,0,[],[],1,http://t.co/L2SH1QDkAY,"[canadas, thoughts, and, prayers, are, with, our, australian, friends, martinplace, sydneysiege]",0,5,2,1,1,1,0,0,0,0,2,0,0,0,0,88,11,False,False,True,0.068182,3.466423,3.893429,5.800086,2713,True
5799,Every non-muslim in the world must watch this video https://t.co/sZdyhISoVh &amp; show it every other non-muslim! #sydneysiege,1,[https://t.co/sZdyhISoVh],[https://www.youtube.com/watch?v=d8c38_46W5c],1,http://t.co/uTUmOx49Zj,"[every, nonmuslim, in, the, world, must, watch, this, video, amp, show, it, every, other, nonmuslim, sydneysiege]",1,4,3,2,1,0,0,1,0,0,1,0,4,1,0,126,16,False,True,True,0.039683,3.681151,1.924279,4.557531,549,False
5800,"Suspect in Sydney cafe siege identified as Man Haron Monis, an Iranian granted asylum in Australia http://t.co/6Lrl9DEMXA",1,[http://t.co/6Lrl9DEMXA],[http://bbc.in/1znVJHB],1,http://www.bbc.co.uk/news,"[suspect, in, sydney, cafe, siege, identified, as, man, haron, monis, an, iranian, granted, asylum, in, australia]",1,7,3,2,0,0,0,0,0,0,3,0,1,0,0,121,16,False,False,True,0.107438,4.349879,5.007671,7.097693,2793,True


In [23]:
df_train[['has_question', 'has_exclaim', 'has_period','verified']] = df_train[['has_question', 'has_exclaim', 'has_period','verified']].astype(int)
df_test[['has_question', 'has_exclaim', 'has_period','verified']] = df_test[['has_question', 'has_exclaim', 'has_period','verified']].astype(int)
df_data[['has_question', 'has_exclaim', 'has_period','verified']] = df_data[['has_question', 'has_exclaim', 'has_period','verified']].astype(int)

## Handling inf and NaN value

In [24]:
# print(np.any(np.isnan(X)))
# print(np.all(np.isfinite(X)))
# print(X['listed_count'].mean())
# print(pd.DataFrame(X['listed_count'].replace([np.inf, -np.inf], np.nan)).mean())

# listed_nan = X[['listed_count','tweet_count','follow_ratio','age']].loc[X['listed_count']<0.2]
# listed_nan = listed_nan.replace([np.inf, -np.inf], np.nan)
# np.isnan(listed_nan).sum()

In [25]:
for dataset in [df_train, df_test, df_data]:
    dataset['listed_count'].replace([np.inf, -np.inf], np.nan, inplace=True)
    print(dataset['listed_count'].mean())
    print("Before fill: Does the dataset contain NaN value? {}".format(np.any(np.isnan(dataset['listed_count']))))
    dataset['listed_count'].fillna(0,inplace=True)
    print("After fill: Does the dataset contain NaN value? {}".format(np.any(np.isnan(dataset['listed_count']))))

3.2022264175363997
Before fill: Does the dataset contain NaN value? True
After fill: Does the dataset contain NaN value? False
2.6928269219616747
Before fill: Does the dataset contain NaN value? True
After fill: Does the dataset contain NaN value? False
3.1020948474234245
Before fill: Does the dataset contain NaN value? True
After fill: Does the dataset contain NaN value? False


In [113]:
df_data = pd.concat([df_data,data_y],axis=1)
df_data.to_csv('./data/data_notembeded.csv', index = False)

### Reaction files

In [None]:
reactionFiles = data_loader().getdata(reaction=True,root=False)
data_rt = cross_val_jsons(reactionFiles[0], isTest = False)
data_lists, isRumorLists = extract_data(data_rt)

data_rt = data_lists[0]
data_rt_y = isRumorLists[0]
print("(Data of Root tweets) Data: {}".format(len(data_rt)))
print("(Data of Root tweets) Label: {}".format(len(data_rt_y)))

df_data_rt = pd.DataFrame(flatten_tweets(data_rt))

df_data_rt[['has_question', 'has_exclaim', 'has_period','verified']] = df_data_rt[['has_question', 'has_exclaim', 'has_period','verified']].astype(int)

for dataset in [df_data_rt]:
    dataset['listed_count'].replace([np.inf, -np.inf], np.nan, inplace=True)
    print(dataset['listed_count'].mean())
    print("Before fill: Does the dataset contain NaN value? {}".format(np.any(np.isnan(dataset['listed_count']))))
    dataset['listed_count'].fillna(0,inplace=True)
    print("After fill: Does the dataset contain NaN value? {}".format(np.any(np.isnan(dataset['listed_count']))))
print(data_rt.shape, data_rt_y.shape)
df_data_rt.to_csv('./data/data_rt_notembeded.csv', index = False)

# Word2vec

In [72]:
word2vec_ = w2v.Word2Vec(
    df_train['text_token'],
    sg = 1, 
    seed = 1,
    workers = 8,
    size = 300,
    min_count = 5,
    window = 10,
    sample = 1e-3
)

In [74]:
word2vec_.train(df_train['text_token'], total_examples = word2vec_.corpus_count, epochs = word2vec_.iter)

(246411, 372950)

In [75]:
# word2vec_.wv.vectors.shape # vocab size / window size

In [76]:
word2vec_.save('w2v_model')

### Load Word Vector model and Vectorize the sentence

In [77]:
word2vec_ = w2v.Word2Vec.load('w2v_model')

In [9]:
word_vectors = word2vec_.wv
vocabs = word_vectors.vocab.keys()

NameError: name 'word2vec_' is not defined

In [79]:
w2v_vectors = word_vectors.vectors # here you load vectors for each word in your model
w2v_indices = {word: word_vectors.vocab[word].index for word in word_vectors.vocab} # here you load indices - with whom you can find an index of the particular word in your model 

In [80]:
# word2vec_.most_similar('liniers')

In [39]:
def vectorize(line): 
    words = []
    for word in line: # line - iterable, for example list of tokens 
        try:
            w2v_idx = w2v_indices[word]
        except KeyError: # if you does not have a vector for this word in your w2v model, continue 
            continue
        words.append(list(w2v_vectors[w2v_idx]))
        if not word:
            words.append(None)

        if len(line) > len(words):
            continue
    return np.asarray(words)

In [82]:
# print("Tweet 1: ", df['text'][1])
# print("Indice of '{}': {}".format(df['text_token'][1][0], w2v_indices[df['text_token'][1][0]]))
# print("Indice of '{}': {}".format(df['text_token'][1][0], w2v_vectors[w2v_indices[df['text_token'][1][0]]]))
# print("Indice of '{}': {}".format(df['text_token'][1][1], w2v_indices[df['text_token'][1][1]]))
# print("Indice of '{}': {}".format(df['text_token'][1][1], w2v_vectors[w2v_indices[df['text_token'][1][1]]]))
# print("\nVector of the first headline:\n", vectorize(df['text_token'][1]))

## Average of Vectors & Previous features

In [83]:
import copy
df_train['text_token_vec'] = copy.deepcopy(df_train['text_token'])

for index, sentence in enumerate(df_train['text_token_vec']):
    df_train['text_token_vec'][index] = vectorize(sentence).mean(axis=0)

pd.DataFrame(df_train['text_token_vec'].values.tolist()).shape
df_train[['text_token','text_token_vec']].head()
df_train_avg = pd.DataFrame(df_train['text_token_vec'].values.tolist()).add_prefix('token_avg') #.join(df)
df_train_avg = df_train.join(df_train_avg).drop('text_token_vec',axis=1)
df_train.drop(['text_token_vec'],axis=1,inplace=True)

In [98]:
df_train_avg.head(2)

Unnamed: 0,text,hasURL,urls,urls_expanded,hasUserURL,user_url,text_token,isNotOnlyText,Noun,Verb,...,token_avg290,token_avg291,token_avg292,token_avg293,token_avg294,token_avg295,token_avg296,token_avg297,token_avg298,token_avg299
0,BREAKING: Armed man takes hostage in kosher gr...,1,[http://t.co/PBs3sMwhLt],[http://htz.li/1lI],1,http://www.haaretz.com,"[breaking, armed, man, takes, hostage, in, kos...",1,6,3,...,0.062943,0.132718,0.133037,-0.026727,0.031101,-0.130553,-0.055182,0.257094,0.000888,-0.071521
1,"#CharlieHebdo killers dead, confirmed by genda...",0,[],[],1,http://www.agnespoirier.org,"[charliehebdo, killers, dead, confirmed, by, g...",0,2,1,...,0.043733,0.111315,-0.040369,-0.014255,-0.099877,-0.110036,0.002233,0.145399,-0.059819,-0.187999


## Same work on X test 

In [225]:
import copy
df_test['text_token_vec'] = copy.deepcopy(df_test['text_token'])

for index, sentence in enumerate(df_test['text_token_vec']):
    df_test['text_token_vec'][index] = vectorize(sentence).mean(axis=0)

# df_test[['text_token','text_token_vec']].head()

df_test_avg = pd.DataFrame(df_test['text_token_vec'].values.tolist()).add_prefix('token_avg')

df_test_avg = df_test.join(df_test_avg).drop('text_token_vec',axis=1)
df_test.drop('text_token_vec',axis=1, inplace=True)

In [86]:
df_test_avg

Unnamed: 0,token_avg0,token_avg1,token_avg2,token_avg3,token_avg4,token_avg5,token_avg6,token_avg7,token_avg8,token_avg9,...,token_avg290,token_avg291,token_avg292,token_avg293,token_avg294,token_avg295,token_avg296,token_avg297,token_avg298,token_avg299
0,0.154436,0.042025,0.106603,-0.018409,0.055064,-0.110473,0.039882,0.002460,0.129744,0.058930,...,0.054728,0.009576,0.050590,-0.022001,-0.033597,-0.091051,-0.020110,0.163158,-0.010016,-0.018631
1,0.146316,-0.049514,0.130602,-0.045639,0.176577,-0.024466,0.055755,0.086503,0.113821,0.158433,...,0.052342,0.076558,0.056597,-0.020453,-0.081126,-0.084036,-0.028039,0.155239,-0.053626,-0.072127
2,0.244598,0.058895,0.154219,0.048585,0.136651,-0.036705,0.037018,0.064442,0.118488,0.129758,...,0.121074,0.190364,0.090983,-0.070175,-0.104205,-0.147921,-0.102687,0.291595,-0.087664,-0.141094
3,0.188610,-0.048544,0.041357,0.004833,0.138942,-0.055199,0.036372,0.059581,0.126967,0.102629,...,0.012302,0.096600,0.010848,-0.010891,-0.035067,-0.102598,-0.024998,0.137281,-0.099979,-0.037174
4,0.156274,-0.046768,0.135716,-0.028251,0.143405,-0.055834,0.020583,0.052493,0.124096,0.153234,...,0.001162,0.037805,0.047072,-0.028495,-0.064918,-0.107768,-0.049309,0.141919,-0.051378,-0.076649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0.189939,-0.091825,0.069886,-0.041547,0.160320,-0.005779,0.064590,0.092563,0.062800,0.122943,...,0.088572,0.095731,0.077700,-0.037176,-0.184658,-0.210214,-0.025299,0.182387,-0.119329,-0.084234
886,-0.046209,-0.078544,0.176929,-0.086290,0.044155,-0.112407,-0.064324,-0.039424,0.138600,0.129153,...,0.013887,-0.049760,0.096969,-0.020735,-0.112816,-0.048133,-0.023543,0.181349,0.036703,0.029212
887,0.262207,-0.051198,0.106294,0.046314,0.092977,-0.108752,-0.044362,-0.002959,0.155272,0.059984,...,0.073013,0.110786,0.063698,0.008327,-0.052947,-0.167440,-0.058933,0.194866,-0.023708,-0.078216
888,0.089673,-0.058084,0.181287,-0.083148,0.099980,-0.044123,-0.007683,0.061265,0.086505,0.138202,...,0.069641,0.003106,0.080451,-0.022685,-0.121224,-0.090532,-0.009335,0.123697,-0.010751,-0.057883


In [87]:
print(df_test_avg.shape)
df_test_avg.head()

(890, 333)


Unnamed: 0,text,hasURL,urls,urls_expanded,hasUserURL,user_url,text_token,isNotOnlyText,Noun,Verb,...,token_avg290,token_avg291,token_avg292,token_avg293,token_avg294,token_avg295,token_avg296,token_avg297,token_avg298,token_avg299
0,EXTENDED: Dramatic video of gunfire inside hal...,1,[http://t.co/SbOu4rAp96],[http://www.ctvnews.ca/video?clipId=472781&pla...,1,http://t.co/1kTbzaumUY,"[extended, dramatic, video, of, gunfire, insid...",1,7,1,...,0.054728,0.009576,0.05059,-0.022001,-0.033597,-0.091051,-0.02011,0.163158,-0.010016,-0.018631
1,Police have clarified that there were two shoo...,0,[],[],1,http://t.co/sIkkpnZFhH,"[police, have, clarified, that, there, were, t...",0,8,3,...,0.052342,0.076558,0.056597,-0.020453,-0.081126,-0.084036,-0.028039,0.155239,-0.053626,-0.072127
2,Soldier killed in Ottawa identified as Cpl. Na...,1,[http://t.co/AOT1ZKyAei],[http://ottawa.ctvnews.ca/video?clipId=473273&...,1,http://ottawa.ctvnews.ca/,"[soldier, killed, in, ottawa, identified, as, ...",1,3,2,...,0.121074,0.190364,0.090983,-0.070175,-0.104205,-0.147921,-0.102687,0.291595,-0.087664,-0.141094
3,NORAD increases number of planes on higher ale...,1,[http://t.co/qsAnGNqBEw],[http://cnn.it/1teSHUE],1,http://t.co/kdkv08KSgi,"[norad, increases, number, of, planes, on, hig...",1,7,3,...,0.012302,0.0966,0.010848,-0.010891,-0.035067,-0.102598,-0.024998,0.137281,-0.099979,-0.037174
4,All 3 patients injured in #OttawaShooting rele...,0,[],[],1,http://t.co/vAXH6cjeS8,"[all, 3, patients, injured, in, ottawashooting...",0,4,3,...,0.001162,0.037805,0.047072,-0.028495,-0.064918,-0.107768,-0.049309,0.141919,-0.051378,-0.076649


# Doc2Vec

## Training

In [88]:
#Doc2vec 실행
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_train['text_token'])]
Doc2vec_model = Doc2Vec(vector_size=300, min_alpha=0.025, window=10, min_count=1, workers=4, epochs=120) #documents,
Doc2vec_model.build_vocab(documents)

Doc2vec_model.train(documents, epochs=Doc2vec_model.epochs, total_examples=Doc2vec_model.corpus_count)

print("The Dimension of a vector is: {}".format(len(Doc2vec_model.docvecs[0]))) # document vector의 정보

The Dimension of a vector is: 300


In [89]:
def most_similar_text(index):
    similar = Doc2vec_model.docvecs.most_similar(index)

    print("The quried text: \n\n{} \n\nis most similar to the text:\n\n{}".format(df_train['text'][index],df_train['text'][similar[0][0]]))

# most_similar_text(100)

In [90]:
Doc2vec_model.save('./model/d2v_model')

## Convert to feature for df (Not Necessary)

In [29]:
list_doc = []
for index, tokens in enumerate(df_train['text_token']):
    list_doc.append(Doc2vec_model.docvecs[index])
    
df_train_doc = pd.DataFrame(list_doc).add_prefix('doc_vec')
df_train_doc = df_train.join(df_train_doc)
df_train_doc.head(2)

NameError: name 'Doc2vec_model' is not defined

## Infer the document vectors from trained Doc2Vec Model

In [91]:
Doc2vec_model = Doc2Vec.load('./model/d2v_model')

### Train data

In [92]:
df_train['text_token_doc'] = copy.deepcopy(df_train_X['text_token'])

for index, sentence in enumerate(df_train['text_token_doc']):
    df_train['text_token_doc'][index] = Doc2vec_model.infer_vector(df_train['text_token_doc'][index],steps=50)

# df_test_X[['text_token','text_token_doc']].head()
df_train_doc = pd.DataFrame(df_train['text_token_doc'].values.tolist()).add_prefix('doc_vec')

df_train_doc = df_train.join(df_train_doc).drop('text_token_doc',axis=1)
# df_train_X_doc2.drop('text_token_doc', axis=1, inplace=True)

### Test data

In [93]:
df_test['text_token_doc'] = copy.deepcopy(df_test_X['text_token'])

for index, sentence in enumerate(df_test['text_token_doc']):
    df_test['text_token_doc'][index] = Doc2vec_model.infer_vector(df_test['text_token_doc'][index],steps=50)

# df_test[['text_token','text_token_doc']].head()
df_test_doc = pd.DataFrame(df_test['text_token_doc'].values.tolist()).add_prefix('doc_vec')

df_test_doc = df_test.join(df_test_doc).drop('text_token_doc',axis=1)
df_test.drop('text_token_doc', axis=1, inplace=True)

# Exporting data

In [84]:
# train:  -> df_train_doc | df_train_avg -> 둘다 text/text_token을 드랍해야함
# test: df_test -> df_test_doc | df_test_avg

In [85]:
# '''Target Variable set'''
# df_train_y.to_csv('./data/train_y.csv', index = False)
# df_test_y.to_csv('./data/test_y.csv', index = False)

In [100]:
'''Word2Vec Averaged Vector Feature set'''
df_train_avg.to_csv('./data/train_avg.csv', index = False)
df_test_avg.to_csv('./data/test_avg.csv', index = False)

In [53]:
'''Doc2vec Vector Feature set'''
df_train_doc.to_csv('./data/train_doc.csv', index = False)
df_test_doc.to_csv('./data/test_doc.csv', index = False)

In [None]:
'''Export data without word vector'''
data.to_csv('./data/X_basic.csv', index = False)
data2.to_csv('./data/y_basic.csv', index = False)

In [None]:
df_train_avg = pd.concat([df_train_y,df_train_X_avg],axis=1)
df_test_avg = pd.concat([df_test_y,df_test_X_avg],axis=1)
df_train_doc = pd.concat([df_train_y,df_train_X_doc],axis=1)
df_test_doc = pd.concat([df_test_y,df_test_X_doc],axis=1)

# Validation data

In [None]:
import copy

In [114]:
gurlitt_jsons = glob('../PHEME/all-rnr-annotated-threads/gurlitt-all-rnr-threads/**/source-tweets/*.json')
ebolaessien_jsons = glob('../PHEME/all-rnr-annotated-threads/ebola-essien-all-rnr-threads/**/source-tweets/*.json')
putinmissing_jsons = glob('../PHEME/all-rnr-annotated-threads/putinmissing-all-rnr-threads/**/source-tweets/*.json')

added_files = [gurlitt_jsons, ebolaessien_jsons, putinmissing_jsons]

valid_data = cross_val_jsons(added_files, False)
data_lists, isRumorLists = extract_data(valid_data)
X_valid = data_lists[0]
y_valid = isRumorLists[0]
print("(Data of Root tweets) X: {} y: {}".format(len(X_valid),len(y_valid)))

df_valid_X = pd.DataFrame(flatten_tweets(X_valid))
df_valid = pd.concat([df_valid_X,y_valid],axis=1)
df_valid[['has_question', 'has_exclaim', 'has_period','verified']] = df_valid[['has_question', 'has_exclaim', 'has_period','verified']].astype(int)

for dataset in [df_valid]:
    dataset['listed_count'].replace([np.inf, -np.inf], np.nan, inplace=True)
    # print(dataset['listed_count'].mean())
    print("Before fill: Does the dataset contain NaN value? {}".format(np.any(np.isnan(dataset['listed_count']))))
    dataset['listed_count'].fillna(0,inplace=True)
    print("After fill: Does the dataset contain NaN value? {}".format(np.any(np.isnan(dataset['listed_count']))))
print("\nDropping tweets with short length (<10)....\n",df_valid.shape)
df_valid.to_csv('./data/data_valid_notembeded.csv', index = False)

df_valid.drop(df_valid[df_valid['word_count'] < 10].index, inplace=True)
df_valid.reset_index(drop=True, inplace=True)
print(" ->",df_valid.shape)

(Data of Root tweets) X: 390 y: 390
Number of total tokens appeared: 5637
Number of unique tokens appeared: 1516

Before fill: Does the dataset contain NaN value? True
After fill: Does the dataset contain NaN value? False

Dropping tweets with short length (<10)....
 (390, 34)
 -> (328, 34)


In [106]:
word2vec_ = w2v.Word2Vec.load('w2v_model')
word_vectors = word2vec_.wv
w2v_vectors = word_vectors.vectors # here you load vectors for each word in your model
w2v_indices = {word: word_vectors.vocab[word].index for word in word_vectors.vocab} # here you load indices - with whom you can find an index of the particular word in your model 

df_valid['text_token_vec'] = copy.deepcopy(df_valid['text_token'])

for index, sentence in enumerate(df_valid['text_token_vec']):
    df_valid['text_token_vec'][index] = vectorize(sentence).mean(axis=0)

avg = pd.DataFrame(df_valid['text_token_vec'].values.tolist()).add_prefix('token_avg')
df_valid_avg = df_valid.join(avg).drop('text_token_vec',axis=1)

df_valid.drop(['text_token_vec'],axis=1,inplace=True)

In [110]:
Doc2vec_model = Doc2Vec.load('./model/d2v_model')
df_valid['text_token_doc'] = copy.deepcopy(df_valid['text_token'])

for index, sentence in enumerate(df_valid['text_token_doc']):
    df_valid['text_token_doc'][index] = Doc2vec_model.infer_vector(df_valid['text_token_doc'][index],steps=50)

doc = pd.DataFrame(df_valid['text_token_doc'].values.tolist()).add_prefix('doc_vec')
df_valid_doc = df_valid.join(doc).drop('text_token_doc',axis=1)

df_valid.drop(['text_token_doc'],axis=1,inplace=True)

In [112]:
df_valid_avg.to_csv('./data/valid_avg.csv', index = False)
df_valid_doc.to_csv('./data/valid_doc.csv', index = False)