# Parse RumEval Twitter data

In [1]:
import os
import numpy as np
import pandas as pd
import pickle as pc
import dateutil.parser
from glob import glob
import json
import codecs

from nltk.tokenize.api import StringTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

In [2]:
import matplotlib.pyplot as plt

# Set font size
fS = 20

In [3]:
# Change to twitter data dir
os.chdir('/home/wmkouw/Dropbox/Projects/ucopenhagen/seq-rumour/data/RumEval2019')

In [4]:
# Get folder paths
twitter_path = 'twitter-english/charliehebdo/'
threads = os.listdir(twitter_path)

In [5]:
# Get labels
with open('train-key.json') as f:
    train_key = json.load(f)
    
with open('dev-key.json') as f:
    dev_key = json.load(f)
    
label_keys = {**train_key['subtaskaenglish'], **dev_key['subtaskaenglish']}
train_key['subtaskaenglish']

{'501760642928635904': 'comment',
 '500270212198174720': 'comment',
 '524971210275565568': 'comment',
 '552836882770690049': 'comment',
 '500289931097296897': 'support',
 '544293230649810944': 'comment',
 '544283032732332032': 'comment',
 '500406679176241153': 'comment',
 '500300182844878848': 'comment',
 '552811591683821568': 'comment',
 '524938343180931073': 'comment',
 '521360786409943041': 'deny',
 '524946532416884736': 'comment',
 '544347456238931968': 'comment',
 '525020658972258305': 'query',
 '552817834628501505': 'comment',
 '524959078229880832': 'comment',
 '553588165408473088': 'comment',
 '544293748377518080': 'comment',
 '553196346375954433': 'query',
 '524957146656821249': 'comment',
 '524992668301262848': 'deny',
 '500294827351621632': 'comment',
 '544307827070615553': 'support',
 '524955418955902976': 'comment',
 '553541495588802560': 'comment',
 '500301494193627136': 'comment',
 '525026551411519488': 'comment',
 '553565149790224387': 'comment',
 '544520601038098433': '

In [6]:
# Text array
tweet_id = []
thread_ix = [] 
response_ix = []
reply_ix = []
texts = []
created_date = []
created_datetime = []
labels = []

# Loop over threads
for t, thread in enumerate(threads):
    
    with open(twitter_path + thread + '/source-tweet/' + thread + '.json') as f:
        tweet = json.load(f)
        
        tweet_id.append(thread)
        thread_ix.append(t)
        reply_ix.append(0)
        texts.append(tweet['text'])
        created_date.append(dateutil.parser.parse(tweet['created_at']).date())
        created_datetime.append(dateutil.parser.parse(tweet['created_at']))  
        labels.append(label_keys[thread])
        
    replies = os.listdir(twitter_path + thread + '/replies/')
    for r, reply in enumerate(replies):
        
        with open(twitter_path + thread + '/replies/' + reply) as f:
            tweet = json.load(f)
                
            tweet_id.append(reply[:-5])
            thread_ix.append(t)
            reply_ix.append(r + 1)
            texts.append(tweet['text'])
            created_date.append(dateutil.parser.parse(tweet['created_at']).date())
            created_datetime.append(dateutil.parser.parse(tweet['created_at']))  
            labels.append(label_keys[reply[:-5]])
    

In [7]:
tweet

{'contributors': None,
 'truncated': False,
 'text': '@WSJ Well that guys a nut.',
 'in_reply_to_status_id': 553576010898497536,
 'id': 553576199268876288,
 'favorite_count': 1,
 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Mobile Web (M5)</a>',
 'retweeted': False,
 'coordinates': None,
 'entities': {'symbols': [],
  'user_mentions': [{'id': 3108351,
    'indices': [0, 4],
    'id_str': '3108351',
    'screen_name': 'WSJ',
    'name': 'Wall Street Journal'}],
  'hashtags': [],
  'urls': []},
 'in_reply_to_screen_name': 'WSJ',
 'id_str': '553576199268876288',
 'retweet_count': 0,
 'in_reply_to_user_id': 3108351,
 'favorited': False,
 'user': {'follow_request_sent': False,
  'profile_use_background_image': True,
  'profile_text_color': '333333',
  'default_profile_image': False,
  'id': 509601309,
  'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
  'verified': False,
  'profile_location': None,
  'profile_image_url_https': 'ht

In [8]:
# Convert to dataframe
data = pd.DataFrame({'id': tweet_id,
                     'thread_ix': thread_ix,
                     'reply_ix': reply_ix,
                     'text': texts,
                     'created_date': created_date,
                     'created_datetime': created_datetime,
                     'label': labels})

In [9]:
# write frame to csv
data.to_csv('./RumEval19.csv', sep='\t', encoding='utf-8', index=False)

In [10]:
data

Unnamed: 0,created_date,created_datetime,id,label,reply_ix,text,thread_ix
0,2015-01-07,2015-01-07 11:28:49+00:00,552788945017516032,support,0,Appalled by the attack on Charlie Hebdo in Par...,0
1,2015-01-07,2015-01-07 12:01:13+00:00,552797099138748416,query,1,@katherine1924 @Mumbobee @tnewtondunn @mehdirh...,0
2,2015-01-07,2015-01-07 11:40:59+00:00,552792005559189504,comment,2,@tnewtondunn @mehdirhasan attacking religion ...,0
3,2015-01-07,2015-01-07 11:58:32+00:00,552796424266854400,comment,3,@m33ryg @tnewtondunn @mehdirhasan Can you supp...,0
4,2015-01-07,2015-01-07 11:52:24+00:00,552794881601859584,comment,4,@m33ryg @tnewtondunn @mehdirhasan You think pe...,0
5,2015-01-07,2015-01-07 11:29:22+00:00,552789083211460608,comment,5,@tnewtondunn You guys should put one of their ...,0
6,2015-01-07,2015-01-07 11:31:54+00:00,552789720540119040,comment,6,"@mjhsinclair @tnewtondunn Sadly, since none of...",0
7,2015-01-07,2015-01-07 12:04:05+00:00,552797821188206592,comment,7,@UnbiasedF If you go into such facts it will b...,0
8,2015-01-07,2015-01-07 11:59:36+00:00,552796692995907584,comment,8,@m33ryg @Mumbobee @tnewtondunn @mehdirhasan Th...,0
9,2015-01-07,2015-01-07 11:33:15+00:00,552790062778548224,comment,9,@tnewtondunn solidarity is key - we can't be m...,0


# Twitter word embeddings

In [11]:
os.chdir('/home/wmkouw/Dropbox/Projects/ucopenhagen/seq-rumour/data/word2vec-twitter')

In [12]:
# change 'xrange' in word2vecReader to 'range'
exec(open("repl.py").read())

Loading the model, this can take some time...
The vocabulary size is: 3039345


In [97]:
tt = TweetTokenizer()

num_tweets = len(data)

wemb = np.zeros((num_tweets, 400))
for n in range(num_tweets):
    
    aa = tt.tokenize(data['text'][n])
    
    ct = 0
    for a in aa:
        
        try:
            wemb[n, :] += model.__getitem__(a)
            ct += 1
        except:
            print('.', end='')
    
    # Average embeddings
    wemb[n, :] /= ct
    
# Add word embeddings to dataframe
data = data.assign(embedding=wemb.tolist())

# Write embbeding array separately
np.save('rumeval19.npy', wemb)

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................



........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................