In [2]:
import json
import pandas as pd
import scipy as sp
import scipy.stats as stats
import matplotlib.pyplot as plt
import re
import multiprocessing
import numpy as np
import shutil 
import preprocessor as p
import pickle
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS 

# Set of stopwords from Stone, Denis, Kwantes
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from nltk import classify
from nltk import NaiveBayesClassifier

from textblob import TextBlob
import itertools 

import pickle

from sklearn.linear_model import LinearRegression
import pandas_profiling

import statsmodels.api as sm

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


%matplotlib inline
plt.style.use('ggplot')

pd.set_option('display.max_columns', 500)
pd.options.display.max_rows = 500

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vinhtran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

def keyword_column_boolean(df, keyword_list):
    for x in keyword_list:
        df[x] = df['text'].apply(lambda text: word_in_text(x,text))
# sentiment analysis
def get_tweet_polarity(tweet):
        '''
        Utility function to classify sentiment of passed tweet
        using textblob's sentiment method
        '''
        # create TextBlob object of passed tweet text
        analysis = TextBlob(tweet)
        # set sentiment
        return analysis.sentiment.polarity
    

def get_tweet_sentiment(polarity):
        if polarity > 0:
            return 'positive'
        elif polarity == 0:
            return 'neutral'
        else:
            return 'negative'
def load_US_coord_dict():
    '''
    Input: n/a
    Output: A dictionary whose keys are the location names ('City, State') of the
    378 US classification locations and the values are the centroids for those locations
    (latitude, longittude)
    '''

    pkl_file = open("GeoData/US_coord_dict.pkl", 'rb')
    US_coord_dict = pickle.load(pkl_file)
    pkl_file.close()
    return US_coord_dict

def find_dist_between(tup1, tup2):
    '''
    INPUT: Two tuples of latitude, longitude coordinates pairs for two cities
    OUTPUT: The distance between the cities
    '''

    return np.sqrt((tup1[0] - tup2[0])**2 + (tup1[1] - tup2[1])**2)

def closest_major_city(tup):
    '''
    INPUT: A tuple of the centroid coordinates for the tweet to remap to the closest major city
    OUTPUT: String, 'City, State', of the city in the dictionary 'coord_dict' that is closest to the input city
    '''

    d={}
    for key, value in US_coord_dict.items():
        dist = find_dist_between(tup, value)
        if key not in d:
            d[key] = dist
    return min(d, key=d.get)

def get_closest_major_city_for_US(row):
    '''
    Helper function to return the closest major city for US users only. For users
    outside the US it returns 'NOT_IN_US, NONE'
    '''
    return closest_major_city(row['coordinate_point'])

### Pre-processing

In [49]:
#Combine all files into one

with open('data/output_file_weekend.txt','wb') as wfd:
    for f in [
               'data/output_file.txt',         
               'data/twitter_data_c3_pt1.txt',
               'data/twitter_data_c3_pt2.txt', 
               'data/twitter_data_c3_pt3.txt', 
               'data/twitter_data_c3_pt4.txt', 
               'data/twitter_data_c3_pt5.txt', 
               'data/twitter_data_c3_pt6.txt', 
               'data/twitter_data_c3_pt7.txt',
               'data/twitter_data_c3_pt8.txt'
             ]:
        with open(f,'rb') as fd:
            shutil.copyfileobj(fd, wfd)

### Read in and subset data

In [5]:
#read in data as chunks
df = pd.read_json('data/twitter/output_file_weekend.txt', lines = True)

In [51]:
#only keep variables needed
df = df[["id_str", "text", "place", "coordinates", "created_at", "lang", "possibly_sensitive","retweeted" ]]

In [52]:
#don't include retweets
keyword_column_boolean(df, ['RT'])
print("full df:", len(df))
df = df[df['RT']==False]
print("after RT removal:", len(df))

full df: 202249
after RT removal: 111626


In [53]:
#booleans for each diet
diet_list = ['keto','whole30','gluten','mediterranean','lowfat', 'atkins', 'paleo', 'celeryjuice']
keyword_column_boolean(df, diet_list)

In [54]:
#fix Place field
#fill None with 0
filled = df['place'].fillna(0)
#replace old column
df["place"] = filled

In [55]:
# #mask
place = df[df['place'] != 0]
place.reset_index(inplace=True)

#pull out bounding box from place
df_place = [i for i in df["place"] if i] 
df_place_2 = pd.DataFrame(list(np.array(df_place)))

city = df_place_2[["name","country_code"]]

bounding_box = pd.DataFrame(list(np.array(df_place_2["bounding_box"])))
bounding_box.rename(columns={'coordinates': 'bounding_box'}, inplace=True)

#add to original df to get df we want
df_location = pd.concat([place, bounding_box], axis=1, join='inner')
df_location = pd.concat([df_location, city], axis=1, join='inner')
print("has location data:", len(df_location))

has location data: 5429


In [56]:
#limit to only english for analysis
df_location = df_location[df_location['lang'] == 'en']
print("after english removal and with df_location:", len(df_location))
df_eng = df[df['lang'] == 'en']
print("after english removal df_eng:", len(df))

after english removal df_location: 4891
after english removal df_eng: 111626


In [57]:
#limit to only US for analysis
df_location = df_location[df_location['country_code'] == 'US']
print("after non-US removal:", len(df_location))

after non-US removal: 3269


In [58]:
#clean tweets - remove URLs, smileys, mentions, emojis

p.set_options(p.OPT.URL, p.OPT.SMILEY, p.OPT.MENTION, p.OPT.EMOJI) 


text_list = list(df_location["text"])

clean_text_list = []
for tweet in text_list:   
    clean_text_list.append(p.clean(tweet))

df_location["text_clean"] = clean_text_list 

In [59]:
#manually delete for now. eventually would like to do a hieararchy where if place is missing 
# and coordinates is not, fill with coordinates
df_location = df_location[df_location["id_str"] != 1141697585700204544]

#fix index
df_location2 = df_location.reset_index()
df_location3 = df_location2.drop(["index", "level_0"], axis = 1)
df_location3['index'] = df_location3.index

#final df
df_clean = df_location3

### create dataframe with clean text and index for merging


In [60]:
# create a new frame with only the cleaned text (tweet) and index
documents_v = df_clean[['text_clean', "index"]]

print(documents_v[:10])

                                          text_clean  index
0  you should make Keto meal versions of food you...      0
1  Today is a good day #sanantonio #glutenfree #h...      1
2  Have you tried our new Classic Cheeseburger? O...      2
3  I love when I walk into a grocery store for th...      3
4  Sausage and Black Olive pizzas in and do Antip...      4
5  I just had a customer tell me that the keto di...      5
6  This just made me laugh! Keto friends know. Th...      6
7  Perfect sunny Seattle day at my favorite brewe...      7
8  Enjoying a Grapefruit IPA on a perfect sunny S...      8
9  It's a hot trend now smh. They'd keto water an...      9


### NLP SENTIMENT ANALYSIS

In [61]:
# using TextBlob calculate polarity and sentiment on clean tweets
documents_v['polarity'] = documents_v['text_clean'].map(get_tweet_polarity);
documents_v['sentiment'] = documents_v['polarity'].map(get_tweet_sentiment);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Merge to cities data

In [62]:
#to get coordinates from place

list2 = [item[0] for item in df_clean["bounding_box"]]
list3 = [item[0] for item in list2]
list4 = [item[::-1] for item in list3]

df_clean["coordinate_point"] = list4

In [63]:
if __name__ == "__main__":

    # Load US_coord_dict
    US_coord_dict = load_US_coord_dict()

    # Create a new column called 'closest_major_city'
    df_clean['closest_major_city'] = df_clean.apply(lambda row: get_closest_major_city_for_US(row), axis=1)
    

In [76]:
prediction = pd.concat([df_clean, documents_v], axis=1)

# Read in CDC data with obesity rates

In [174]:
# read in CDC data
cities = pd.read_csv('data/500_Cities__City-level_Data__GIS_Friendly_Format___2018_release.csv')


In [1]:
#bin CDC obesity rates for classifier
#putting into 3 bins
cities["OBESITY_cut"] = pd.qcut(cities["OBESITY_AdjPrev"],3, labels = ["low", "medium", 'high'])

NameError: name 'pd' is not defined

### work on sentiment dataframe before merging to CDC data

In [176]:
#aggregate for not binned data by closest_major_city
prediction2 = prediction.groupby(['closest_major_city','sentiment'])['polarity'].mean()

In [166]:
#look at the count by bin
count_by_bin = prediction_cut2.groupby(['OBESITY_cut','sentiment'])['polarity'].agg(['count','mean']).reset_index()
count_by_bin.pivot_table(index='OBESITY_cut', columns="sentiment", values=('mean', 'count'))

Unnamed: 0_level_0,count,count,count,mean,mean,mean
sentiment,negative,neutral,positive,negative,neutral,positive
OBESITY_cut,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
low,127,386,484,-0.273935,0.0,0.414024
medium,155,385,540,-0.266884,0.0,0.411987
high,161,328,675,-0.29634,0.0,0.459977


In [177]:
#look at the count by city
count_by_city = prediction.groupby(['closest_major_city','sentiment'])['polarity'].agg(['count','mean']).reset_index()
count_by_city.pivot_table(index='closest_major_city', columns="sentiment", values=('mean', 'count'))

Unnamed: 0_level_0,count,count,count,mean,mean,mean
sentiment,negative,neutral,positive,negative,neutral,positive
closest_major_city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
"Abilene, TX",,1.0,2.0,,0.0,0.28
"Akron, OH",2.0,6.0,3.0,-0.440625,0.0,0.351667
"Albuquerque, NM",1.0,4.0,2.0,-0.1875,0.0,0.6875
"Alexandria, VA",5.0,22.0,15.0,-0.155,0.0,0.345094
"Alhambra, CA",2.0,4.0,1.0,-0.15,0.0,0.6
"Allentown, PA",1.0,4.0,7.0,-0.21,0.0,0.360763
"Amarillo, TX",,4.0,8.0,,0.0,0.53875
"Anaheim, CA",,,2.0,,,0.9
"Anchorage, AK",2.0,3.0,1.0,-0.177778,0.0,0.85
"Ann Arbor, MI",2.0,5.0,7.0,-0.4625,0.0,0.441964


In [178]:
prediction2 = pd.DataFrame(prediction2)
prediction3 = prediction2.pivot_table(index='closest_major_city', columns="sentiment", values='polarity')
prediction3 =  prediction3.rename_axis(None, axis=1).reset_index() 
prediction4 = prediction3.rename(columns = {"index": 'PlaceName'})

#pull out city and state into separate columns
city = prediction4["closest_major_city"].str.split(',', expand=True)
city.rename(columns = {0:"closest_city", 1: "closest_state"}, inplace=True)
prediction5 = pd.concat([city, prediction4], axis=1, join='inner')
#remove spaces for merging
prediction5["closest_state"] = prediction5["closest_state"].str.strip()


In [179]:
#merge to cities CDC data
prediction6 = pd.merge(prediction5,cities,  
                       left_on=('closest_city','closest_state'), 
                       right_on=('PlaceName','StateAbbr'), 
                       indicator = True)

In [70]:
# prediction5["closest_city"] = prediction5["closest_city"].str.encode('utf-8')
# prediction5["closest_state"] = prediction5["closest_state"].str.encode('utf-8')
# cities["PlaceName"] = cities["PlaceName"].str.encode('utf-8')
# cities["StateAbbr"] = cities["StateAbbr"].str.encode('utf-8')
# prediction5["closest_city"]


In [180]:
#check merge with Springfield
prediction6[prediction6["closest_city"] == "Springfield"]

Unnamed: 0,closest_city,closest_state,closest_major_city,negative,neutral,positive,StateAbbr,PlaceName,PlaceFIPS,Population2010,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ACCESS2_AdjPrev,ACCESS2_Adj95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,ARTHRITIS_AdjPrev,ARTHRITIS_Adj95CI,BINGE_CrudePrev,BINGE_Crude95CI,BINGE_AdjPrev,BINGE_Adj95CI,BPHIGH_CrudePrev,BPHIGH_Crude95CI,BPHIGH_AdjPrev,BPHIGH_Adj95CI,BPMED_CrudePrev,BPMED_Crude95CI,BPMED_AdjPrev,BPMED_Adj95CI,CANCER_CrudePrev,CANCER_Crude95CI,CANCER_AdjPrev,CANCER_Adj95CI,CASTHMA_CrudePrev,CASTHMA_Crude95CI,CASTHMA_AdjPrev,CASTHMA_Adj95CI,CHD_CrudePrev,CHD_Crude95CI,CHD_AdjPrev,CHD_Adj95CI,CHECKUP_CrudePrev,CHECKUP_Crude95CI,CHECKUP_AdjPrev,CHECKUP_Adj95CI,CHOLSCREEN_CrudePrev,CHOLSCREEN_Crude95CI,CHOLSCREEN_AdjPrev,CHOLSCREEN_Adj95CI,COLON_SCREEN_CrudePrev,COLON_SCREEN_Crude95CI,COLON_SCREEN_AdjPrev,COLON_SCREEN_Adj95CI,COPD_CrudePrev,COPD_Crude95CI,COPD_AdjPrev,COPD_Adj95CI,COREM_CrudePrev,COREM_Crude95CI,COREM_AdjPrev,COREM_Adj95CI,COREW_CrudePrev,COREW_Crude95CI,COREW_AdjPrev,COREW_Adj95CI,CSMOKING_CrudePrev,CSMOKING_Crude95CI,CSMOKING_AdjPrev,CSMOKING_Adj95CI,DENTAL_CrudePrev,DENTAL_Crude95CI,DENTAL_AdjPrev,DENTAL_Adj95CI,DIABETES_CrudePrev,DIABETES_Crude95CI,DIABETES_AdjPrev,DIABETES_Adj95CI,HIGHCHOL_CrudePrev,HIGHCHOL_Crude95CI,HIGHCHOL_AdjPrev,HIGHCHOL_Adj95CI,KIDNEY_CrudePrev,KIDNEY_Crude95CI,KIDNEY_AdjPrev,KIDNEY_Adj95CI,LPA_CrudePrev,LPA_Crude95CI,LPA_AdjPrev,LPA_Adj95CI,MAMMOUSE_CrudePrev,MAMMOUSE_Crude95CI,MAMMOUSE_AdjPrev,MAMMOUSE_Adj95CI,MHLTH_CrudePrev,MHLTH_Crude95CI,MHLTH_AdjPrev,MHLTH_Adj95CI,OBESITY_CrudePrev,OBESITY_Crude95CI,OBESITY_AdjPrev,OBESITY_Adj95CI,PAPTEST_CrudePrev,PAPTEST_Crude95CI,PAPTEST_AdjPrev,PAPTEST_Adj95CI,PHLTH_CrudePrev,PHLTH_Crude95CI,PHLTH_AdjPrev,PHLTH_Adj95CI,SLEEP_CrudePrev,SLEEP_Crude95CI,SLEEP_AdjPrev,SLEEP_Adj95CI,STROKE_CrudePrev,STROKE_Crude95CI,STROKE_AdjPrev,STROKE_Adj95CI,TEETHLOST_CrudePrev,TEETHLOST_Crude95CI,TEETHLOST_AdjPrev,TEETHLOST_Adj95CI,Geolocation,OBESITY_cut,_merge
278,Springfield,IL,"Springfield, IL",-0.1,,0.49375,IL,Springfield,1772000,116250,10.6,"(10.2, 10.9)",10.9,"(10.6, 11.2)",27.2,"(27.0, 27.5)",25.2,"(25.0, 25.4)",19.2,"(19.1, 19.3)",20.0,"(19.9, 20.2)",33.2,"(33.0, 33.4)",31.2,"(31.0, 31.4)",77.1,"(76.9, 77.3)",61.2,"(61.0, 61.4)",7.0,"( 6.9, 7.0)",6.4,"( 6.3, 6.4)",9.9,"( 9.8, 10.1)",10.0,"( 9.9, 10.1)",6.7,"( 6.6, 6.8)",6.0,"( 5.9, 6.1)",69.5,"(69.4, 69.7)",68.4,"(68.3, 68.7)",75.6,"(75.3, 75.9)",74.4,"(74.1, 74.7)",63.9,"(63.2, 64.5)",63.8,"(63.2, 64.5)",7.3,"( 7.1, 7.5)",6.8,"( 6.7, 7.0)",34.5,"(33.1, 36.0)",34.6,"(33.2, 35.9)",29.8,"(28.7, 30.9)",30.7,"(29.7, 31.8)",20.3,"(19.8, 20.8)",20.8,"(20.4, 21.3)",60.8,"(60.2, 61.4)",60.4,"(59.8, 61.0)",10.4,"(10.3, 10.5)",9.6,"( 9.5, 9.7)",37.8,"(37.6, 38.0)",32.2,"(32.1, 32.4)",3.1,"( 3.0, 3.1)",2.9,"( 2.8, 2.9)",25.4,"(25.0, 25.9)",25.0,"(24.5, 25.4)",77.5,"(76.8, 78.1)",73.0,"(72.2, 73.6)",12.4,"(12.2, 12.6)",12.6,"(12.4, 12.8)",35.6,"(35.3, 35.9)",35.9,"(35.6, 36.2)",84.3,"(83.9, 84.7)",79.7,"(79.3, 80.2)",13.2,"(12.9, 13.4)",12.7,"(12.4, 12.9)",34.2,"(34.0, 34.5)",34.7,"(34.4, 35.0)",3.4,"( 3.3, 3.5)",3.1,"( 3.1, 3.2)",15.3,"(14.4, 16.4)",15.4,"(14.4, 16.3)","(39.77164733220, -89.6540780049)",high,both
279,Springfield,MA,"Springfield, MA",-0.478571,0.0,0.25,MA,Springfield,2567000,153060,15.9,"(15.6, 16.4)",16.1,"(15.8, 16.6)",27.5,"(27.3, 27.6)",28.4,"(28.2, 28.6)",16.3,"(16.2, 16.5)",15.9,"(15.8, 16.0)",31.7,"(31.5, 31.8)",32.8,"(32.6, 32.9)",75.8,"(75.7, 76.0)",61.8,"(61.7, 62.0)",5.5,"( 5.5, 5.5)",5.8,"( 5.7, 5.8)",12.7,"(12.6, 12.9)",12.6,"(12.5, 12.8)",7.4,"( 7.3, 7.5)",7.7,"( 7.6, 7.8)",78.5,"(78.3, 78.6)",78.8,"(78.7, 78.9)",71.3,"(71.0, 71.7)",73.2,"(72.9, 73.5)",59.6,"(59.0, 60.1)",60.1,"(59.5, 60.6)",8.2,"( 8.0, 8.4)",8.4,"( 8.3, 8.6)",27.0,"(26.2, 27.9)",27.3,"(26.5, 28.2)",25.2,"(24.5, 25.9)",25.9,"(25.2, 26.6)",23.5,"(23.1, 24.0)",23.8,"(23.4, 24.2)",58.7,"(58.1, 59.3)",58.5,"(57.9, 59.0)",13.3,"(13.2, 13.4)",13.8,"(13.7, 14.0)",36.0,"(35.8, 36.2)",32.6,"(32.5, 32.8)",3.5,"( 3.5, 3.5)",3.6,"( 3.6, 3.7)",34.5,"(34.1, 35.0)",35.1,"(34.7, 35.6)",83.3,"(82.9, 83.7)",77.3,"(76.8, 77.8)",17.1,"(16.9, 17.4)",16.9,"(16.7, 17.2)",36.2,"(36.0, 36.4)",37.2,"(37.1, 37.4)",84.3,"(84.0, 84.6)",77.0,"(76.6, 77.4)",16.5,"(16.3, 16.8)",16.9,"(16.6, 17.1)",38.9,"(38.7, 39.1)",39.2,"(39.0, 39.4)",4.0,"( 3.9, 4.1)",4.2,"( 4.1, 4.2)",26.0,"(24.8, 27.3)",26.1,"(24.9, 27.3)","(42.11549779990, -72.5395254143)",high,both


In [182]:
prediction6.drop_duplicates(inplace=True)

In [183]:
len(prediction6)

327

In [184]:
# output data to pickle


pd.to_pickle(prediction6, 'data/cities_sentiment_data.pkl')