# Answers to coding questions in deep learning
Peeter Niidas

In [57]:
# Make some preparations

import glob
import json
import pandas as pd
import numpy as np
import re
import spacy
from gensim.models import Word2Vec
import os
import sys
from time import time

import matplotlib.pyplot as plt
#%tensorflow_version 1.x

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

1. Download either the PHEME5 or PHEME9 dataset.
2. Choose any one event from the dataset and load its first 100 JSON files into
python memory.
3. Extract and load attributes such as source tweets, reply tweets, and favorites
count, labels of the source tweets.

In [58]:
# Get filenames and load data

def read_rename_data (dirr, length):
    path = dirr
    df = pd.DataFrame()
    for filename in glob.iglob(path + '**/*.json', recursive=True):
        if 'source-tweet' in filename:
            i = 1
            f = open(filename)
            data = json.load(f)
            new_row = {'source_tweet': data['text'], 'favorite_count': data['favorite_count'], 'retweet_count':data['retweet_count']}
            df = df.append(new_row, ignore_index=True)
            f.close()
        elif 'reactions' in filename:
            f = open(filename)
            data = json.load(f)
            new_name = 'reply_' + str(i)
            df.loc[df.index[-1], new_name] = data['text']
            i += 1
        if df.shape[0] == length + 1:
            df.drop(df.tail(1).index,inplace=True)
            return df
    return df

In [116]:
df_true = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/non-rumours/', 2)
df_true.shape

(2, 73)

In [60]:
df_rumors = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/rumours/', 100)
df_rumors.shape

(100, 94)

In [110]:
df_true.head()

Unnamed: 0,source_tweet,favorite_count,retweet_count,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,...,reply_61,reply_62,reply_63,reply_64,reply_65,reply_66,reply_67,reply_68,reply_69,reply_70
0,There are no verses in the Quran about Me want...,4533.0,4465.0,@withnodriver no shit sherlock. are you for re...,@withnodriver @basznocz Religion doesnt have t...,@fury_jen @TheTweetOfGod blasphemer.,@runitright1 @TheTweetOfGod Sometime also Thor..,@Usairam1 mohamed liked to fuck little aisha. ...,@RStarovich @TheTweetOfGod Hahahha...you're ri...,@Usairam1 @basznocz you're kidding they're mak...,...,@nselby @TheTweetOfGod no one has guns? It sa...,"@basznocz islam is a lie, just a suppressive m...",@withnodriver @basznocz Haha. All the science ...,@cIeopatrio @TheTweetOfGod you know it was a j...,@timpoliti @runitright1 @TheTweetOfGod Allah t...,@fury_jen @TheTweetOfGod his voicemail is ful...,@withnodriver @basznocz Were WW1 WW2 and many ...,@basznocz @TheTweetOfGod religion is a complet...,@weisblumen @IncredulousMark @withnodriver @ba...,@TheTweetOfGod sorry god but you're wrong. hav...
1,"Mosques, synagogues and churches are inviting ...",116.0,157.0,@elronxenu I'm surprised he hasn't been taken ...,@TeamOyeniyi Specifically: guy was on his way ...,@TeamOyeniyi So long as he doesn't kill any ho...,@elronxenu So is he just holed up until the bi...,@abcnewsSydney #sydneysiege Very disturbing ne...,@abcnewsSydney @jarrodmckenna Amen.,@elronxenu I'm inclined to think outside the s...,...,,,,,,,,,,


Data cleaning

In [101]:
# remove hashtags and mentions of other users, lowercase everything, remove punctuation and stopwords

import re
import string
from nltk.corpus import stopwords


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1124)>


In [114]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


invalid command name "exit"
    while executing
"exit"


True

In [111]:
stop = stopwords.words('english')

def data_cleaning(x):
    exclusion_list = ['#', '@']
    x = str(x).lower()
    for char in string.punctuation:
        x = x.replace(char, '')
    x = x.replace("\n", " ")
    words = str(x).split()
    cleaned = [word for word in words if all(ch not in word for ch in exclusion_list)]
    cleaned_stopwords = [word for word in cleaned if word not in stop]
    return ' '.join(cleaned_stopwords)


In [112]:
df_true = df_true.applymap(data_cleaning)

In [113]:
df_true

Unnamed: 0,source_tweet,favorite_count,retweet_count,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,...,reply_61,reply_62,reply_63,reply_64,reply_65,reply_66,reply_67,reply_68,reply_69,reply_70
0,verses quran wanting anyone hold hostages choc...,45330,44650,withnodriver shit sherlock real said decades t...,withnodriver basznocz religion doesnt anything...,furyjen thetweetofgod blasphemer,runitright1 thetweetofgod sometime also thor,usairam1 mohamed liked fuck little aisha liked...,rstarovich thetweetofgod hahahhayoure right pl...,usairam1 basznocz youre kidding theyre making ...,...,nselby thetweetofgod one guns says hes armed,basznocz islam lie suppressive measure populat...,withnodriver basznocz haha science rn due sinc...,cieopatrio thetweetofgod know joke right,timpoliti runitright1 thetweetofgod allah tras...,furyjen thetweetofgod voicemail full please tr...,withnodriver basznocz ww1 ww2 many wars killed...,basznocz thetweetofgod religion complete fallacy,weisblumen incredulousmark withnodriver baszno...,thetweetofgod sorry god youre wrong havent rea...
1,mosques synagogues churches inviting public pr...,1160,1570,elronxenu im surprised hasnt taken hostages ou...,teamoyeniyi specifically guy way bigger event ...,teamoyeniyi long doesnt kill hostages excellen...,elronxenu holed bigger event happens,abcnewssydney sydneysiege disturbing news deep...,abcnewssydney jarrodmckenna amen,elronxenu im inclined think outside square one...,...,,,,,,,,,,


In [115]:
# Cleaning with spaCy
import spacy

In [117]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning (entry):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df_true['source_tweet'])

<generator object <genexpr> at 0x7fcf9a3096d0>

In [118]:
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]


NameError: name 'doc' is not defined

In [None]:
# Tokenize everything

from nltk.tokenize import sent_tokenize, word_tokenize

import warnings
warnings.filterwarnings(action = 'ignore')
import gensim
from gensim.models import Word2Vec

In [96]:
# Creating database of main posts and comments (all text)
output = ''

i = 1
for i in df_true:
    data = df_true[i]
    #print(data[1])
    #data = np.array(data)
    for j in data:
        print(data(j))

mosques synagogues and churches are inviting the public to pray for sydney hostages tonightsydneysiege


TypeError: 'Series' object is not callable

In [97]:
output

''

In [85]:
sent = [row for row in df_true['source_tweet']]

In [86]:
sent

['there are no verses in the quran about me wanting anyone to hold hostages in a chocolate shop in sydney you terrorist fucks',
 'mosques synagogues and churches are inviting the public to pray for sydney hostages tonightsydneysiege']

In [None]:
def tokenise(x):
    data = []
    for i in sent_tokenize(x):
        temp =  []
        for j in word_tokenize(i):
            temp.append(j)
        data.append(temp)
    return data

In [84]:
df_true.applymap(tokenise)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/peeter/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.8/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.8/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.8/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************
