# Figuring out the number of words in our dictionary

In [45]:
import tensorflow as tf
import numpy as np
import pandas as pd
import nltk
import textblob
import glob

from six.moves import urllib

import errno
import os
import zipfile

def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

## Twitter Samples
Not super useful as they are UK 2015 Election related.

In [46]:
from nltk.corpus import twitter_samples
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [47]:
twitter_samples.strings()[:5]

['hopeless for tmr :(',
 "Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
 '@Hegelbon That heart sliding into the waste basket. :(',
 '“@ketchBurning: I hate Japanese call him "bani" :( :(”\n\nMe too',
 'Dang starting next week I have "work" :(']

In [48]:
tweets = twitter_samples.tokenized()
tweet_dict = []
for i in range(len(tweets)):
    for j in range(len(tweets[i])):
        tweet_dict.append(tweets[i][j])


print(tweet_dict[:10])

['hopeless', 'for', 'tmr', ':(', 'Everything', 'in', 'the', 'kids', 'section', 'of']


In [49]:
print(len(twitter_samples.strings()))

30000


In [54]:
scraped_files = glob.glob('./datasets/water_tweets/gathered_water/*.csv')
scraped_tweets = pd.concat([pd.read_csv(f, encoding = 'ISO-8859-1', delimiter = ';') for f in scraped_files])
scraped_tweets = scraped_tweets[scraped_tweets['text'] != 'text']
scraped_tweets = scraped_tweets.drop_duplicates(subset = ['text'], keep = 'first').reset_index()
scraped_tweets = scraped_tweets[['category', 'tweet_id', 'text', 'date']]
print(len(scraped_tweets))
scraped_tweets.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


70175


Unnamed: 0,category,tweet_id,text,date
0,coal ash water,1.14e+18,*@noahcicero *@DaisyFried Coal ash ponds are t...,6/26/2019 3:22
1,coal ash water,1.14e+18,"""NC wants to find the source of water contamin...",6/25/2019 15:30
2,coal ash water,1.14e+18,*@atrupar Making great progress?\n you're doin...,6/25/2019 14:43
3,coal ash water,1.14e+18,"As if we needed more reasons to ditch coal, it...",6/25/2019 9:36
4,coal ash water,1.14e+18,*@shossy2 *@RodneyClaeys W/ Obama's restrictio...,6/25/2019 0:42


In [55]:
unrel_files = glob.glob('./datasets/water_tweets/rnd_tweets/*.csv')
unrel = pd.concat([pd.read_csv(f, encoding = 'ISO-8859-1', delimiter = ';') for f in unrel_files])
unrel = unrel[unrel['text'] != 'text']
unrel = unrel.drop_duplicates(subset = ['text'], keep = 'first').reset_index()
print(len(unrel))
unrel = unrel.iloc[:, :5]
unrel = unrel.iloc[:, 1:]
unrel.to_csv(r'datasets\water_tweets\unrelated_tweets.csv', sep = ';')
print(unrel.columns)
unrel.head()

83099
Index(['category', 'tweet_id', 'text', 'date'], dtype='object')


Unnamed: 0,category,tweet_id,text,date
0,the,1.152e+18,If you do the Jeep wave Iâm cutting your fin...,7/18/2019 23:38
1,the,1.152e+18,*@pokelover941 *@PoGoMaster5000 *@Kelven91 To ...,7/18/2019 23:38
2,the,1.152e+18,I actually went to the gym during the day who ...,7/18/2019 23:38
3,the,1.152e+18,*@PralineQueen1 I am quite looking forward to ...,7/18/2019 23:38
4,the,1.152e+18,*@CoryBooker Is it possible said officer was l...,7/18/2019 23:38


In [56]:
unrel_text = np.array(unrel.text.astype(str))

from textblob import TextBlob
unrel_list = []
for t in range(len(unrel_text)):
    new = TextBlob(unrel_text[t])
    unrel_list.append(new)

unrel_words = []
for i in range(len(unrel_list)):
    tweet_i = unrel_list[i]
    new_words = tweet_i.words
    unrel_words.append(new_words)
    
print(len(unrel_words))

83099


In [57]:
from itertools import chain
unrel_words = np.array(list(chain(*unrel_words)))

In [58]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\debro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:
water_text = np.array(scraped_tweets.text.astype(str))

water_list = []
for i in range(len(water_text)):
    tweet = water_text[i]
    water_list.append(tweet)
water_list[:5]

['*@noahcicero *@DaisyFried Coal ash ponds are the demonic mockery of water, but yeah, even logs can thrive and yearn in any floating hell...',
 '"NC wants to find the source of water contaminants near the Chatham coal ash landfill" #ncpol https://t.co/cAQN72okYW',
 "*@atrupar Making great progress?\n you're doing everything you can to\n Reverse water/air Standards\nThe Trump administration just revoked the rule that prevented coal companies from  dumping coal Ash in Americas streams\n\nAlong with\nMANY OTHERS\n\nhttps://t.co/VWiDvkWr8r",
 'As if we needed more reasons to ditch coal, it\'s also #radioactive. "ounce for ounce, coal ash released from a power plant delivers more #radiation than nuclear waste shielded via water or dry cask storage." #nocleancoal \nhttps://t.co/LZgh5PlHZx',
 '*@shossy2 *@RodneyClaeys W/ Obama\'s restrictions Duke Energy got away with "environmental murder" in NC.Coal ash pond emptied via pipe directly into Dan River.*@VP *@POTUS *@GOP,&amp  *@EPA,you ALL wo

In [70]:
water = []
for i in range(len(water_list)):
    tweet = str(water_list[i])
    new_tweet = TextBlob(tweet)
    water.append(new_tweet)
    
print(water[:5])

[TextBlob("*@noahcicero *@DaisyFried Coal ash ponds are the demonic mockery of water, but yeah, even logs can thrive and yearn in any floating hell..."), TextBlob(""NC wants to find the source of water contaminants near the Chatham coal ash landfill" #ncpol https://t.co/cAQN72okYW"), TextBlob("*@atrupar Making great progress?
 you're doing everything you can to
 Reverse water/air Standards
The Trump administration just revoked the rule that prevented coal companies from  dumping coal Ash in Americas streams

Along with
MANY OTHERS

https://t.co/VWiDvkWr8r"), TextBlob("As if we needed more reasons to ditch coal, it's also #radioactive. "ounce for ounce, coal ash released from a power plant delivers more #radiation than nuclear waste shielded via water or dry cask storage." #nocleancoal 
https://t.co/LZgh5PlHZx"), TextBlob("*@shossy2 *@RodneyClaeys W/ Obama's restrictions Duke Energy got away with "environmental murder" in NC.Coal ash pond emptied via pipe directly into Dan River.*@VP *@

In [72]:
rel_h2o_words = []
for i in range(len(water)):
    tweet_i = water[i]
    new_words = tweet_i.words
    rel_h2o_words.append(new_words)
    
print(rel_h2o_words[:2])

[WordList(['noahcicero', 'DaisyFried', 'Coal', 'ash', 'ponds', 'are', 'the', 'demonic', 'mockery', 'of', 'water', 'but', 'yeah', 'even', 'logs', 'can', 'thrive', 'and', 'yearn', 'in', 'any', 'floating', 'hell']), WordList(['NC', 'wants', 'to', 'find', 'the', 'source', 'of', 'water', 'contaminants', 'near', 'the', 'Chatham', 'coal', 'ash', 'landfill', 'ncpol', 'https', 't.co/cAQN72okYW'])]


In [74]:
tweet_words = np.array(tweet_dict)

In [73]:
from itertools import chain
rel_h2o_words = np.array(list(chain(*rel_h2o_words)))

In [75]:
print(len(tweet_words) + len(rel_h2o_words) + len(unrel_words))

4623892


In [43]:
complete = []
tweet_words = list(tweet_words)
rel_h2o_words = list(rel_h2o_words)
unrel_words = list(unrel_words)
#complete = list(complete.append([tweet_words, rel_h2o_words, unrel_words]))


In [77]:
complete = np.concatenate([tweet_words, rel_h2o_words, unrel_words])
unique = np.unique(complete)
print(len(unique), len(complete))

MemoryError: 