# Answers to coding questions in deep learning
Peeter Niidas

In [1]:
# Make some preparations

import glob
import json
import pandas as pd
import numpy as np
import re
import spacy
from gensim.models import Word2Vec
import os
import sys
from time import time

import matplotlib.pyplot as plt
#%tensorflow_version 1.x

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

1. Download either the PHEME5 or PHEME9 dataset.
2. Choose any one event from the dataset and load its first 100 JSON files into
python memory.
3. Extract and load attributes such as source tweets, reply tweets, and favorites
count, labels of the source tweets.

In [2]:
# Get filenames and load data

def read_rename_data (dirr, length):
    path = dirr
    df = pd.DataFrame()
    for filename in glob.iglob(path + '**/*.json', recursive=True):
        if 'source-tweet' in filename:
            i = 1
            f = open(filename)
            data = json.load(f)
            new_row = {'source_tweet': data['text'], 'favorite_count': data['favorite_count'], 'retweet_count':data['retweet_count']}
            df = df.append(new_row, ignore_index=True)
            f.close()
        elif 'reactions' in filename:
            f = open(filename)
            data = json.load(f)
            new_name = 'reply_' + str(i)
            df.loc[df.index[-1], new_name] = data['text']
            i += 1
        if df.shape[0] == length:
            return df
    return df

In [34]:
df_true = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/non-rumours/', 100)
df_true.shape

  self.obj[key] = infer_fill_value(value)


(100, 344)

In [4]:
df_rumors = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/rumours/', 100)
df_rumors.shape

(100, 94)

In [35]:
df_true.head()

Unnamed: 0,source_tweet,favorite_count,retweet_count,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,...,reply_332,reply_333,reply_334,reply_335,reply_336,reply_337,reply_338,reply_339,reply_340,reply_341
0,There are no verses in the Quran about Me want...,4533.0,4465.0,@withnodriver no shit sherlock. are you for re...,@withnodriver @basznocz Religion doesnt have t...,@fury_jen @TheTweetOfGod blasphemer.,@runitright1 @TheTweetOfGod Sometime also Thor..,@Usairam1 mohamed liked to fuck little aisha. ...,@RStarovich @TheTweetOfGod Hahahha...you're ri...,@Usairam1 @basznocz you're kidding they're mak...,...,,,,,,,,,,
1,"Mosques, synagogues and churches are inviting ...",116.0,157.0,@elronxenu I'm surprised he hasn't been taken ...,@TeamOyeniyi Specifically: guy was on his way ...,@TeamOyeniyi So long as he doesn't kill any ho...,@elronxenu So is he just holed up until the bi...,@abcnewsSydney #sydneysiege Very disturbing ne...,@abcnewsSydney @jarrodmckenna Amen.,@elronxenu I'm inclined to think outside the s...,...,,,,,,,,,,
2,#illridewithyou: Viral Aussie solidarity amid ...,85.0,148.0,@MelonRouge I'll ride u and ur mom,@sirmomonothomo @MelonRouge \nLol\nKeep tweeti...,@sirmomonothomo @RT_com truth hurts dogs breath,@rhu71 @RT_com u look like ur waiting for a cu...,@sirmomonothomo #iwon'tridewithyou,@RT_com Will Abbot shirtfront terrorist?,@RT_com ISIS is Islam is terror,...,,,,,,,,,,
3,We continue to monitor the situation in #Sydne...,68.0,138.0,,,,,,,,...,,,,,,,,,,
4,We love the #illridewithyou initiative. Spread...,106.0,156.0,@KatSiecris we agree. It's important to rememb...,@AmnestyNSW this quote was the first thing I t...,,,,,,...,,,,,,,,,,


Cleaning data

In [36]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning (entry):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df_true['source_tweet'])
brief_cleaning

<generator object <genexpr> at 0x7feb598e1900>

In [37]:
brief_cleaning

<generator object <genexpr> at 0x7feb598e1900>

In [13]:
t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))


NameError: name 'doc' is not defined

In [38]:
# remade previous code (got somehting as txt, unused cleaning def)

t = time()
txt = nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.0 mins


In [39]:
df_clean = pd.DataFrame({'clean': txt})
df_clean.shape


(100, 1)

In [40]:
df_clean

Unnamed: 0,clean
0,"(there, are, no, verses, in, the, quran, about..."
1,"(mosques, synagogues, and, churches, are, invi..."
2,"( , illridewithyou, viral, aussie, solidarity,..."
3,"(we, continue, to, monitor, the, situation, in..."
4,"(we, love, the, illridewithyou, initiative, sp..."
...,...
95,"(praying, for, the, hostages, and, family, 's,..."
96,"(thinking, of, everyone, in, sydney, today, x)"
97,"(the, sydney, harbour, bridge, is, not, closed..."
98,"(breaking, news, sydneysiege, is, over, accord..."


In [28]:
from gensim.models.phrases import Phrases, Phraser

In [42]:
#sent = [row.split() for row in df_clean['clean']]
sent = [row for row in df_clean['clean']]

In [43]:
# not working
phrases = Phrases(sent, min_count=30, progress_per=10000)

[there are no verses in the quran about me wanting anyone to hold hostages in a chocolate shop in sydney you terrorist fucks ,
 mosques synagogues and churches are inviting the public to pray for sydney hostages tonight sydneysiege,
  illridewithyou viral aussie solidarity amid sydneysiege islamic state links http t co ubuxbixkdl http t co vqcuwegdqh,
 we continue to monitor the situation in sydney closely i've been in touch with juliebishopmp and expressed our thoughts and prayers ,
 we love the illridewithyou initiative spread love and compassion thoughts are with all those affected sydney http t co ek uycpkdn,
 sydney siege has now hit the hour mark live with latest summary of events http t co iht o tvtr http t co cxebiksp r,
 if you're taking selfies outside the sydneysiege hostage situation you seriously need to reevaluate your life http t co i wkmeof,
 the lebanese muslim association has issued a statement condemning sydneysiege and calling for calm http t co z hcibpaha,
 on the 

In [45]:
# related to previous
sentences = bigram[sent]


NameError: name 'bigram' is not defined

In [None]:
# most frequent words (edited) - not working, depends on previous

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

In [None]:

# testing something

nlp = spacy.load("en_core_web_sm")
for text in nlp.pipe(iter(df_true['source_tweet']), batch_size = 1000, n_process=-1):
    for token in text:
        print(token)
    print('\n')


In [None]:
import gensim


import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [27]:
# Read comments from db

def read_from_pandas(db):
    for i in range(df_true.shape[0]):
        sentence = df_true.iloc[i]['source_tweet']
        #print(gensim.utils.simple_preprocess(sentence))
        yield gensim.utils.simple_preprocess(sentence)


In [31]:
doc = read_from_pandas(df_true)

In [32]:
print(doc)

<generator object read_from_pandas at 0x7f7b49a3e900>


In [33]:
# Build a model

model = gensim.models.Word2Vec(
    doc,
    size=150,
    window=10,
    min_count=2,
    workers=10,
    iter=10)


TypeError: __init__() got an unexpected keyword argument 'size'

In [29]:
w1 = "dirty"
model.wv.most_similar(positive=w1)

NameError: name 'model' is not defined

Converting words into vectors using word2vec

/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages
gensim installed
