In [18]:
"""
Conventions to follow :

1) Place the downloaded tweets(JSON files) on your desktop screen inside a folder named as "tweets_json".
2) Call final_dataset_function to get the processed dataframe and dictionary with assigned sentiment values to each 
   document and LSA dictionary with their associated topics.

Note : The execution of the program takes some time(like 15 min approximately), have patience.
   
"""

'\nConventions to follow :\n\n1) Place the downloaded tweets(JSON files) on your desktop screen inside a folder named as "tweets_json".\n2) Call final_dataset_function to get the processed dataframe and dictionary with assigned sentiment values to each \n   document and LSA dictionary with their associated topics.\n\nNote : The conversion of coordinate to value takes some time to execute, Have Patience :P\n   \n'

In [2]:
# Installing dependencies
! pip install numpy
! pip install pandas
! pip install utm
! pip install tqdm
! pip install shapely
! pip install sklearn



In [3]:
# Importing libraries
import glob,os
import pandas as pd
from tqdm import tqdm_notebook
import numpy as np

import matplotlib.pyplot as plt
import utm, math
from shapely.geometry import Point, Polygon

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from twokenize3 import *
from sentiment import *
from collections import defaultdict

Adding ngram features : ngram_range 2
Add bigram sentiment scores
Add unigram sentiment scores


In [4]:
# Geospatial Grid
def generateGridNumber(df,offset,coors,xGrid):
    """ The accepts a data frame which has atleast 2 columns with names 'Latitude' and
        'Longitude'. It will be converted into UTM(Universal Transverse Mercator) co-odrinates for obtaining
        grid of a locality.
        
        input:
        Data frame with 'Latitude' and 'Longitude'
    
        output:
        Data frame with additional column which represents Grid Numbers
        
        requirements:
        import utm
        import pandas as pd
    """
    df = pd.concat([df, 
            df[['Latitude', 'Longitude']].apply(lambda r: pd.Series(dict(zip(('UTM Lat', 'UTM Long'), utm.from_latlon(r['Latitude'], r['Longitude'])[:2]))), axis=1)],
            axis=1)
    df['Lat_ind'] = ((df['UTM Lat'] -  coors['low_left_x']) / offset).astype(int)
    df['Long_ind'] = ((df['UTM Long'] -  coors['low_left_y']) / offset).astype(int)
    df['Grid Number'] = df['Long_ind'] * xGrid + df['Lat_ind']
    del df['UTM Lat']
    del df['UTM Long']
    return df

In [19]:
# Creates hdf5 documents on the basis of c2v values
def makeDocuments(c2v_values,dataset):
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    os.chdir(desktop_path)
    directory = "hdf5_Doc"
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(desktop_path + "/" + directory)
    for values in c2v_values:
        doc = dataset[dataset["Grid Number"] == values]
        filename = str(values) + ".hdf5"
        doc.reset_index(drop=True)
        doc.to_hdf(filename,'key')

In [20]:
# Processing dataframe
def create_dataframe():
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    tweets_path = desktop_path + "/tweets_json/*.json"
    tweets = pd.concat(map(pd.read_json, tqdm_notebook(glob.glob(tweets_path))))
    return tweets

# Removing the null values from the input dataframe
def filter_dataframe(data):
    # coordinates = longitude
    # geo = lattitude
    headers = ["geo", "text","lang","timestamp_ms"]
    df = data[headers]
    for columns in headers:
        df = df[df[columns].notnull()]
    # Considering the tweets only in english language
    return df[df["lang"] == "en"].reset_index(drop=True)

# Converts the coordinates to their respective grid index
def c2v(dataset,offset,coors,xGrid):
    """ Below are the set of constants that are being used in the Project includes :
    - offset for generating the grid
    - lower left and upper right co-ordinates of a locality(in this case, Chicago)
    """
    latitude,longitude = ([] for i in range(2))
    for values in dataset["geo"]:
        latitude.append(values["coordinates"][0])
        longitude.append(values["coordinates"][1])
    dataset["Latitude"] = latitude
    dataset["Longitude"] = longitude
    dataset = generateGridNumber(dataset,offset,coors,xGrid)
    del dataset['geo']
    return dataset

In [21]:
# Initialising dataframe
def initialise_dataframe():
    df = create_dataframe()
    df = filter_dataframe(df)
    return df

In [22]:
# Process the dataframe to get sentiment and tokenized values
def process_dataframe(dataset,offset,coors,xGrid):
    myDic1,myDic2 = ({} for i in range(2))
    tweetList = []
    dataset = c2v(dataset,offset,coors,xGrid)
    dataset["sentiment_text"] = dataset["text"].apply(findSentimentTweet)
    dataset["twokenized_text"] = dataset["text"].apply(tokenizeRawTweetText)
    dataset.dropna(inplace=True)
    dataset.sort_values(['Grid Number'],inplace=True)
    dataset["Grid Number"] = dataset["Grid Number"].astype(int)
    c2v_list = dataset["Grid Number"].unique()
    makeDocuments(c2v_list,dataset)
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    directory = desktop_path + "/" + "hdf5_Doc"
    for filename in os.listdir(os.getcwd()):
        if filename.endswith(".hdf5"):
            temp = []
            doc = pd.read_hdf(filename)
            for values in doc["text"]:
                temp.append(values)
            tweetList.append(temp)
            myDic1[str(filename)] = findSentimentDoc(tweetList[-1])
            myDic2[str(filename)] = tweetList[-1]
        else:
            pass
    return dataset,myDic1,myDic2,tweetList

# Vectorization of the documents
def vectorizer(document,vectorizer):
    matrix = vectorizer.fit_transform(document)
    return matrix.shape[0], matrix, vectorizer

# Decompostion of the matrix
def decomposition(shape,matrix,vectorizer,dic_keys,lsa_dict):
    lsa = TruncatedSVD(n_components=shape, n_iter=100)
    lsa.fit(matrix)
    terms = vectorizer.get_feature_names()
    for i,comp in enumerate(lsa.components_):
        termList = []
        termsInComp = zip(terms,comp)
        sortedterms = sorted(termsInComp, key=lambda x: x[1],reverse=True)[:10]
        mykey = "Topic :" + str(i)
        for term in sortedterms:
            termList.append(term[0])
        lsa_dict[mykey] = termList
    return lsa_dict

In [23]:
def final_dataset_function():
    decompostion_dict,lsa,lsa_dict = ({} for i in range(3))
    offset = 1000
    coors = {'low_left_x' : 421710.112401581, 'low_left_y' : 4610737.961457818, 'up_right_x' : 456608.39121255605, 'up_right_y' : 4652466.087380382}
    xGrid = int((coors['up_right_x'] - coors['low_left_x']) / offset) + 1
    df = initialise_dataframe()
    df,doc_sentiment,myDict2, textData = process_dataframe(df,offset,coors,xGrid)
    my_vectorizer = TfidfVectorizer(use_idf=True,ngram_range=(1,3))
    for keys,values in myDict2.items():
        shape,matrix,my_vectorizer = vectorizer(values,my_vectorizer)
        myDict = decomposition(shape,matrix,my_vectorizer,keys,lsa_dict)
        lsa[keys] = myDict
    return df,doc_sentiment,lsa

In [24]:
df,doc_sentiment,lsa = final_dataset_function()

A Jupyter Widget




your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['text', 'lang', 'twokenized_text']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var


In [26]:
df.head()

Unnamed: 0,text,lang,timestamp_ms,Latitude,Longitude,Lat_ind,Long_ind,Grid Number,sentiment_text,twokenized_text
8020,We're #hiring! Read about our latest #job open...,en,2017-12-09 14:32:35.840,37.114218,-89.197853,-116,-500,-17616,-0.108444,"[We're, #hiring, !, Read, about, our, latest, ..."
10085,Join the Dollar General team! See our latest #...,en,2017-12-09 19:13:49.955,37.114218,-89.197853,-116,-500,-17616,-0.122382,"[Join, the, Dollar, General, team, !, See, our..."
5493,Podcast-thirty. #Fandom #Nonsense #IndiFunFun ...,en,2017-12-11 06:03:49.240,37.12917,-88.5794,-62,-500,-17562,-0.303462,"[Podcas, t-t, hirty, ., #Fandom, #Nonsense, #I..."
9632,I'm at Kentucky / Illinois State Line in Metro...,en,2017-12-10 20:50:56.114,37.138425,-88.685288,-71,-499,-17536,-0.403814,"[I'm, at, Kentucky, /, Illinois, State, Line, ..."
7411,We're #hiring! Click to apply: Driver Helper -...,en,2017-12-09 15:28:18.578,37.415329,-88.897843,-89,-468,-16469,-0.301291,"[We're, #hiring, !, Click, to, apply, :, Drive..."


In [27]:
doc_sentiment

{'10832.hdf5': -0.013969854240542257,
 '1676.hdf5': -0.19104823264228082,
 '1204.hdf5': 0.05497064363721469,
 '325.hdf5': 0.075054097059950975,
 '3142.hdf5': 0.025864313249863935,
 '864.hdf5': -0.11127063215641296,
 '-97.hdf5': -0.16332415203310222,
 '687.hdf5': -0.18062573917755084,
 '17264.hdf5': -0.4326687542486321,
 '13960.hdf5': -0.010754976848506756,
 '3891.hdf5': -0.2887065286885071,
 '-3395.hdf5': -0.3285225684746958,
 '7161.hdf5': 0.28615701875719668,
 '706.hdf5': 0.021870149091158142,
 '1646.hdf5': -0.10628517055672981,
 '450.hdf5': -0.19111878923980513,
 '187.hdf5': -0.11140042582561338,
 '292.hdf5': -0.10499716952986332,
 '1307.hdf5': -0.16365955476177582,
 '-108.hdf5': 0.055677252831532131,
 '459.hdf5': 0.0081054310891230941,
 '4280.hdf5': -0.19924658225006636,
 '-7779.hdf5': -0.18947044523087481,
 '5442.hdf5': 0.099845731312730779,
 '1370.hdf5': -0.24641897381791877,
 '7930.hdf5': -0.066092712486088043,
 '-11432.hdf5': 0.15894090954247964,
 '574.hdf5': -0.1549387270090587

In [None]:
for values in lsa:
    print(values)