In [2]:
"""
Conventions to follow :

1) Place the downloaded tweets(JSON files) on your desktop screen inside a folder named as "tweets_json".
2) Call final_dataset_function to get the processed dataframe and dictionary with assigned sentiment values to each 
   document.

Note : The conversion of coordinate to value takes some time to execute, Have Patience :P
   
"""

'\nConventions to follow :\n\n1) Place the downloaded tweets(JSON files) on your desktop screen inside a folder named as "tweets_json".\n2) Call final_dataset_function to get the processed dataframe and dictionary with assigned sentiment values to each \n   document.\n\nNote : The conversion of coordinate to value takes some time to execute, Have Patience :P\n   \n'

In [4]:
# Installing dependencies
! pip install numpy
! pip install pandas
! pip install utm
! pip install tqdm


Requirement already up-to-date: textblob in /home/suzane/anaconda3/lib/python3.6/site-packages
Requirement already up-to-date: nltk>=3.1 in /home/suzane/anaconda3/lib/python3.6/site-packages (from textblob)
Requirement already up-to-date: six in /home/suzane/anaconda3/lib/python3.6/site-packages (from nltk>=3.1->textblob)
[nltk_data] Downloading package brown to /home/suzane/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/suzane/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/suzane/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/suzane/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /home/suzane/nltk_data...
[nltk_data]   Package conll2000 is already up-to-dat

In [1]:
# Importing libraries
import glob,os
import pandas as pd
from tqdm import tqdm_notebook
import numpy as np

import matplotlib.pyplot as plt
import utm, math
from shapely.geometry import Point, Polygon

from twokenize3 import *
from sentiment import *

Adding ngram features : ngram_range 2
Add bigram sentiment scores
Add unigram sentiment scores


In [75]:
# Geospatial Grid
def generateGridNumber(df,offset,coors,xGrid):
    """ The accepts a data frame which has atleast 2 columns with names 'Latitude' and
        'Longitude'. It will be converted into UTM(Universal Transverse Mercator) co-odrinates for obtaining
        grid of a locality.
        
        input:
        Data frame with 'Latitude' and 'Longitude'
    
        output:
        Data frame with additional column which represents Grid Numbers
        
        requirements:
        import utm
        import pandas as pd
    """
    df = pd.concat([df, 
            df[['Latitude', 'Longitude']].apply(lambda r: pd.Series(dict(zip(('UTM Lat', 'UTM Long'), utm.from_latlon(r['Latitude'], r['Longitude'])[:2]))), axis=1)],
            axis=1)
    df['Lat_ind'] = ((df['UTM Lat'] -  coors['low_left_x']) / offset).astype(int)
    df['Long_ind'] = ((df['UTM Long'] -  coors['low_left_y']) / offset).astype(int)
    df['Grid Number'] = df['Long_ind'] * xGrid + df['Lat_ind']
    del df['UTM Lat']
    del df['UTM Long']
    return df

In [76]:
# Creates hdf5 documents on the basis of c2v values
def makeDocuments(c2v_values,dataset):
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    os.chdir(desktop_path)
    directory = "hdf5_Doc"
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(desktop_path + "/" + directory)
    for values in c2v_values:
        doc = dataset[dataset["Grid Number"] == values]
        filename = str(values) + ".hdf5"
        doc.reset_index(drop=True)
        doc.to_hdf(filename,'key')

In [77]:
# Processing dataframe
def create_dataframe():
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    tweets_path = desktop_path + "/tweets_json/*.json"
    tweets = pd.concat(map(pd.read_json, tqdm_notebook(glob.glob(tweets_path))))
    return tweets

# Removing the null values from the input dataframe
def filter_dataframe(data):
    # coordinates = longitude
    # geo = lattitude
    headers = ["geo", "text","lang","timestamp_ms"]
    df = data[headers]
    for columns in headers:
        df = df[df[columns].notnull()]
    # Considering the tweets only in english language
    return df[df["lang"] == "en"].reset_index(drop=True)

# Converts the coordinates to their respective grid index
def c2v(dataset,offset,coors,xGrid):
    """ Below are the set of constants that are being used in the Project includes :
    - offset for generating the grid
    - lower left and upper right co-ordinates of a locality(in this case, Chicago)
    """
    latitude,longitude = ([] for i in range(2))
    for values in dataset["geo"]:
        latitude.append(values["coordinates"][0])
        longitude.append(values["coordinates"][1])
    dataset["Latitude"] = latitude
    dataset["Longitude"] = longitude
    dataset = generateGridNumber(dataset,offset,coors,xGrid)
    del dataset['geo']
    return dataset

In [78]:
# Initialising dataframe
def initialise_dataframe():
    df = create_dataframe()
    df = filter_dataframe(df)
    return df

In [79]:
# Process the dataframe to get sentiment and tokenized values
def process_dataframe(dataset,offset,coors,xGrid):
    myDic = {}
    tweetList = []
    dataset = c2v(dataset,offset,coors,xGrid)
    dataset["sentiment_text"] = dataset["text"].apply(findSentimentTweet)
    dataset["twokenized_text"] = dataset["text"].apply(tokenizeRawTweetText)
    dataset.dropna(inplace=True)
    dataset.sort_values(['Grid Number'],inplace=True)
    dataset["Grid Number"] = dataset["Grid Number"].astype(int)
    c2v_list = dataset["Grid Number"].unique()
    makeDocuments(c2v_list,dataset)
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    directory = desktop_path + "/" + "hdf5_Doc"
    for filename in os.listdir(os.getcwd()):
        if filename.endswith(".hdf5"):
            temp = []
            doc = pd.read_hdf(filename)
            for values in doc["text"]:
                temp.append(values)
            tweetList.append(temp)
            myDic[str(filename)] = findSentimentDoc(tweetList[-1])
        else:
            pass
    return dataset,myDic

In [80]:
def final_dataset_function():
    offset = 1000
    coors = {'low_left_x' : 421710.112401581, 'low_left_y' : 4610737.961457818, 'up_right_x' : 456608.39121255605, 'up_right_y' : 4652466.087380382}
    xGrid = int((coors['up_right_x'] - coors['low_left_x']) / offset) + 1
    df = initialise_dataframe()
    df,myDict = process_dataframe(df,offset,coors,xGrid)
    return df,myDict

In [81]:
df,myDict = final_dataset_function()

A Jupyter Widget




your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['text', 'lang', 'twokenized_text']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [82]:
df

Unnamed: 0,text,lang,timestamp_ms,Latitude,Longitude,Lat_ind,Long_ind,Grid Number,sentiment_text,twokenized_text
4598,Join the Dollar General team! See our latest #...,en,2017-12-09 19:13:49.955,37.114218,-89.197853,-116,-500,-17616,-0.122362,"[Join, the, Dollar, General, team, !, See, our..."
629,We're #hiring! Read about our latest #job open...,en,2017-12-09 14:32:35.840,37.114218,-89.197853,-116,-500,-17616,-0.108426,"[We're, #hiring, !, Read, about, our, latest, ..."
10541,Podcast-thirty. #Fandom #Nonsense #IndiFunFun ...,en,2017-12-11 06:03:49.240,37.129170,-88.579400,-62,-500,-17562,-0.303387,"[Podcas, t-t, hirty, ., #Fandom, #Nonsense, #I..."
9049,I'm at Kentucky / Illinois State Line in Metro...,en,2017-12-10 20:50:56.114,37.138425,-88.685288,-71,-499,-17536,-0.403729,"[I'm, at, Kentucky, /, Illinois, State, Line, ..."
1845,"Coffee, cinnamon and other spices - Drinking a...",en,2017-12-11 04:16:17.697,37.416100,-88.893000,-89,-468,-16469,0.145859,"[Coffee, ,, cinnamon, and, other, spices, -, D..."
7113,We're #hiring! Click to apply: Driver Helper -...,en,2017-12-09 15:28:18.578,37.415329,-88.897843,-89,-468,-16469,-0.301215,"[We're, #hiring, !, Click, to, apply, :, Drive..."
10274,We're #hiring! Click to apply: STORE MANAGER C...,en,2017-12-08 21:09:03.632,37.423663,-88.346148,-40,-468,-16420,-0.358395,"[We're, #hiring, !, Click, to, apply, :, STORE..."
7981,Southern Illinois is home to many talented art...,en,2017-12-09 18:32:29.434,37.530251,-89.166301,-113,-454,-16003,0.520869,"[Southern, Illinois, is, home, to, many, talen..."
9075,"“Then Jesus said to his disciples, “Whoever wa...",en,2017-12-10 20:54:30.944,37.551252,-89.347414,-129,-452,-15949,-0.651474,"[“, Then, Jesus, said, to, his, disciples, ,, ..."
9744,Today is a good day to be on the trail... #hik...,en,2017-12-10 18:59:24.325,37.686400,-89.398400,-133,-437,-15428,0.054186,"[Today, is, a, good, day, to, be, on, the, tra..."


In [83]:
myDict

{'-4634.hdf5': -0.26249980861202715,
 '677.hdf5': -0.18666851006390472,
 '4351.hdf5': -0.30087027279201334,
 '-8131.hdf5': 0.129853444488363,
 '1456.hdf5': -0.04883075185109472,
 '266.hdf5': -0.2563355112872067,
 '1942.hdf5': -0.13125558198805498,
 '7209.hdf5': -0.03678977554634022,
 '5211.hdf5': -0.2310211366793547,
 '-4497.hdf5': -0.09331485135666129,
 '395.hdf5': -0.2148811368905758,
 '16678.hdf5': -0.208569183684026,
 '-12750.hdf5': -0.025823773761511182,
 '-2375.hdf5': -0.1281545425719685,
 '-2659.hdf5': -0.2005063931682622,
 '2252.hdf5': -0.16266470972207175,
 '1074.hdf5': -0.2077701613580235,
 '2788.hdf5': 0.06790083101944933,
 '1093.hdf5': -0.21592609142030517,
 '-9576.hdf5': 0.10730668949931688,
 '100.hdf5': -0.053616199546088206,
 '959.hdf5': -0.14776525187596223,
 '554.hdf5': -0.09070108158729363,
 '10832.hdf5': -0.01393651758141945,
 '900.hdf5': -0.09230347788793798,
 '1376.hdf5': 0.1917244720479001,
 '-612.hdf5': -0.054563606791434405,
 '1022.hdf5': -0.15714988306397654,
 