**Read the CSV merged from the tweets collected by Bhagya for the 3 popular spots**

In [1]:
# Load the Los Angeles tweet export covering Oct 2017 - Dec 2017.
# Only the four columns listed in usecols are read from the ';'-separated file.

import pandas as pd
df = pd.read_csv(
    './Raw_data/full_LA_Oct-Dec2017.csv',
    sep=';',
    usecols=['retweets', 'favorites', 'text', 'hashtags'],
    low_memory=False,
)


In [2]:
# Convert the text and hashtags columns to strings (missing values become the
# literal string 'nan'), then count the '#' characters in each hashtags cell
# to get the number of hashtags per tweet.

df['text'] = df['text'].astype(str)
df['hashtags'] = df['hashtags'].astype(str)
# Vectorised count instead of a Python loop; '#' has no special meaning in a
# regular expression, so this is a plain character count per cell.
df['hashtagCounts'] = df['hashtags'].str.count('#')


**Fetching the popular place info from the Google Places API**

In [3]:
# Query the Google Places Text Search API for points of interest in Los Angeles
# and keep the list of result records in lst_data.

import json
import os
import pprint
import urllib.parse
import urllib.request

# The API key is read from the environment so that it is never stored in the
# notebook. Set GOOGLE_MAPS_API_KEY before running this cell; a missing
# variable raises KeyError here rather than sending a request without a key.
GOOGLE_API_KEY = os.environ['GOOGLE_MAPS_API_KEY']

# urlencode turns the spaces in the query into '+' and escapes the key safely.
URL2 = "https://maps.googleapis.com/maps/api/place/textsearch/json?" + urllib.parse.urlencode({
    'query': 'los angeles point of interest',
    'language': 'en',
    'key': GOOGLE_API_KEY,
})

# The context manager closes the HTTP response even if reading or parsing fails.
with urllib.request.urlopen(URL2) as googleResponse:
    jsonResponse = json.loads(googleResponse.read())

# Fail loudly when the payload reports an error status instead of silently
# continuing with an empty list of places.
status = jsonResponse.get('status', 'OK')
if status not in ('OK', 'ZERO_RESULTS'):
    raise RuntimeError(
        "Google Places request failed with status %s: %s"
        % (status, jsonResponse.get('error_message', 'no error message'))
    )

lst_data = jsonResponse['results']


**extracting the popular place list from json**

In [4]:
# In addition to the points of interest fetched above, separate code was written
# to extract place names from the tweets themselves, so that no place is missed.
# This step merges the names from the API results with the names read from
# pointOfIntersts.csv into a single list, poi_list.

import pandas as pd

# Names returned by the API, in response order.
poi_list = [data['name'] for data in lst_data]
print(poi_list)

# The CSV has no header row; its first column holds the place names.
df_poi = pd.read_csv('./pointOfIntersts.csv', header=None)
wholeListOfPoi = df_poi[0]

poi_list.extend(list(wholeListOfPoi))

print(poi_list)

['El Pueblo de Los Angeles Historical Monument', 'Petersen Automotive Museum', 'Hollywood Sign', 'Korean Friendship Bell', 'Bradbury Building', 'Griffith Observatory', 'Wat Thai of Los Angeles', 'The Getty', 'Hollywood Walk of Fame', 'Griffith Park', 'Venice Canals', 'Urban Light', 'Watts Towers State Historic Park', 'LA Waterfront', 'Los Angeles County Museum of Art', 'Our Lady Queen of Angels Catholic Church', 'Universal Studios Hollywood', 'Autry Museum of the American West', 'Statue "The Four Ladies of Hollywood"', 'Hollywood Bowl']
['El Pueblo de Los Angeles Historical Monument', 'Petersen Automotive Museum', 'Hollywood Sign', 'Korean Friendship Bell', 'Bradbury Building', 'Griffith Observatory', 'Wat Thai of Los Angeles', 'The Getty', 'Hollywood Walk of Fame', 'Griffith Park', 'Venice Canals', 'Urban Light', 'Watts Towers State Historic Park', 'LA Waterfront', 'Los Angeles County Museum of Art', 'Our Lady Queen of Angels Catholic Church', 'Universal Studios Hollywood', 'Autry Mus

In [5]:
# For every point of interest, build the bigrams and trigrams of its name and
# store them in a dictionary keyed by the original name.

from nltk import everygrams
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

stop_words = set(stopwords.words('english'))
d = defaultdict(list)

for poi in poi_list:
    # Lower-case and tokenize the name, then drop English stopwords before forming n-grams.
    content_tokens = [tok for tok in word_tokenize(poi.lower()) if tok not in stop_words]
    # A name left with fewer than two tokens yields no grams and therefore gets no entry in d.
    for gram in everygrams(content_tokens, min_len=2, max_len=3):
        d[poi].append(gram)

        
#print(d.items())

**Searching and tagging each tweet with the popular_spot name found in its text, adding a column to the df with that popular_spot name**

In [6]:
# Searching for the bigram, trigram in the tweet message and assigning it the tourist spot name

def search_str(text):
    """Return the first popular-spot name whose n-gram occurs in ``text``.

    The module-level dictionary ``d`` maps each spot name to a list of token
    tuples. Each tuple is joined with single spaces and looked up as a plain,
    case-insensitive substring of ``text``; the keys of ``d`` are tried in
    iteration order and the first key with a matching gram wins.

    Parameters
    ----------
    text : str
        The tweet text to search.

    Returns
    -------
    The matching key of ``d``, or ``None`` when no gram occurs in ``text`` or
    when ``text`` is not a string (e.g. a missing value).
    """
    if not isinstance(text, str):
        return None

    # Normalise the tweet once instead of once per gram.
    haystack = text.lower().strip()
    for key, value in d.items():
        for v in value:
            if " ".join(v) in haystack:
                return key
    return None

# Tag every tweet with the spot name returned by search_str for its text.
df['popular_spot'] = df.text.apply(search_str)

#df.to_csv('bigramoutput.csv')
#df.head(100)


**Extracting the total number of tweets and summing the favorites per popular spot. We can also total the retweets of the tweets per popular_spot.**

In [9]:
# Load the previously saved sentiment model and its TF-IDF vectorizer from disk
# so they can be reused for scoring the tweets.
# NOTE: joblib files are pickles and can execute code when loaded; only load
# files that come from a trusted source.

try:
    # Standalone joblib package; sklearn.externals.joblib is no longer shipped
    # with newer scikit-learn releases.
    import joblib
except ImportError:
    # Fallback for old scikit-learn installations that still vendor joblib.
    from sklearn.externals import joblib
filename = 'sentiment_model3.0.sav'
clf = joblib.load(filename)
filename = 'tfidf_vectorizer3.0.sav'
tv = joblib.load(filename)


In [10]:
# Score every tweet with the loaded sentiment model:
#   1. transform the tweet text with the loaded vectorizer (tv),
#   2. predict one value per tweet with the classifier (clf),
#   3. store the predictions in a new 'sent_pred' column of df.
# The dataframe is passed to the next step to get the aggregate sums, and is
# also saved to DataNew.csv for review.
# NOTE(review): the meaning/scale of the predicted values depends on how the
# model was trained, which is not visible here - confirm before interpreting.

tweet_tfidf = tv.transform(df['text'].astype(str))
tweet_pred = clf.predict(tweet_tfidf)
df["sent_pred"] = tweet_pred
df.to_csv("DataNew.csv")


In [37]:
# Group the dataframe by popular spot and compute, per spot, the total
# favorites, retweets and hashtag counts plus the mean sentiment prediction.
# Tweets without a matched spot (popular_spot is None) are left out of the
# groups by groupby.

import numpy as np
testdf = df.groupby('popular_spot')
# String aggregator names select pandas' own implementations directly and avoid
# passing NumPy callables to agg.
df_extract = testdf.agg({'favorites': 'sum', 'retweets': 'sum', 'hashtagCounts': 'sum', 'sent_pred': 'mean'})
# Displayed as the cell output, most-favorited spots first (df_extract itself stays unsorted).
df_extract.sort_values(by=['favorites'], ascending=False)


Unnamed: 0_level_0,favorites,retweets,hashtagCounts,sent_pred
popular_spot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
El Pueblo de Los Angeles Historical Monument,103105,17505,25916,3.890009
santa monica,16677,3623,2627,3.868899
beverly hills,3032,545,1723,3.851136
Universal Studios Hollywood,1968,416,685,3.946269
Hollywood Bowl,1492,149,523,3.921909
west hollywood,1459,236,970,3.958549
venice beach,1169,107,1018,3.881818
Hollywood Sign,588,95,385,3.926316
Hollywood Walk of Fame,569,80,592,3.989783
north hollywood,520,100,404,3.679767


In [34]:
# Rank the popular spots: aggregate the tweet metrics per spot, min-max scale
# each metric to [0, 1], average the scaled metrics into a single 'mean' score
# and write the 20 best-scoring spots to FinalReport.csv.

import numpy as np
import pandas
import csv
from sklearn import preprocessing

testdf = df.groupby('popular_spot')
# NOTE(review): 'hashtagCounts' is aggregated with 'size' (number of tweets per
# spot) here, whereas the earlier summary cell sums it, and 'sent_pred' is
# summed here rather than averaged - confirm that this is intended.
df_extract = testdf.agg({'popular_spot': 'size', 'favorites': 'sum', 'retweets': 'sum', 'hashtagCounts': 'size', 'sent_pred': 'sum'})

# Select the metric columns by label (the removed .ix indexer picked the same
# columns by position), so the selection does not depend on column order.
score_columns = ['favorites', 'retweets', 'hashtagCounts', 'sent_pred']
x = df_extract.loc[:, score_columns].values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Columns 0..3 of df1 hold the scaled favorites, retweets, hashtagCounts and
# sent_pred, in that order; 'mean' is their row-wise average.
df1 = pandas.DataFrame(x_scaled)
df1.index = df_extract.index
df1['mean'] = df1.mean(axis=1)
df2 = df1.sort_values(by=['mean'], ascending=False)
print(df2)

# Export the mean score of the 20 highest-ranked spots.
df2['mean'].iloc[0:20].to_csv('FinalReport.csv')


                                                     0         1         2  \
popular_spot                                                                 
El Pueblo de Los Angeles Historical Monument  1.000000  1.000000  1.000000   
santa monica                                  0.161748  0.206969  0.130273   
beverly hills                                 0.029407  0.031134  0.063539   
west hollywood                                0.014151  0.013482  0.053338   
Universal Studios Hollywood                   0.019087  0.023765  0.041141   
venice beach                                  0.011338  0.006113  0.027007   
Hollywood Walk of Fame                        0.005519  0.004570  0.024027   
north hollywood                               0.005043  0.005713  0.021077   
Hollywood Bowl                                0.014471  0.008512  0.014133   
Hollywood Sign                                0.005703  0.005427  0.011645   
el monte                                      0.000621  0.001314

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
