'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
'New Jersey', 'New Mexico', 'New York', 'North Carolina',
'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota',
'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virgin Islands',
'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'

In [1]:
import pandas as pd
import numpy as np
import re
from textblob import TextBlob

**Load & Process Text Data**

In [2]:
def get_quarter_info(x):
    """Map a calendar month number to its quarter.

    Parameters
    ----------
    x : int
        Month number, 1 (January) through 12 (December).

    Returns
    -------
    int
        1 for January-March, 2 for April-June, 3 for July-September and
        4 for October-December. Any value that does not land in quarters
        1-3 (for example NaN or an out-of-range number) falls through to 4,
        mirroring the catch-all ``else`` branch this function has always had.
    """
    # Months are 1-based, so shift to 0-based before the integer division.
    # Dividing the raw month (x // 3) put March in Q2, June in Q3 and
    # September in Q4, and left Q1 with only two months.
    quarter = (x - 1) // 3 + 1
    if quarter in (1, 2, 3):
        return int(quarter)
    return 4

**State name**

In [74]:
# State to process. The value is used verbatim as the sub-folder and file-name
# prefix when reading "collected_tweets/<state>/<state>_<year>.csv" and as the
# prefix of the output file "sentiment_from_tweets/<state>_sentiment.csv".
state_name = "New_York"

**Stitching state tweets from 2012 to 2016**

In [75]:
# Read one CSV per year (2012-2016), normalise its columns, and stitch the
# yearly frames together. The frames are collected in a list and concatenated
# once: DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
yearly_frames = []
for year in range(2012, 2017):
    temp_tweets = pd.read_csv("collected_tweets/%s/%s_%d.csv" % (state_name, state_name, year))
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the freshly read one.
    temp_tweets = temp_tweets[["date", "id", "text", "year"]].copy()
    temp_tweets["year"] = temp_tweets["year"].astype(int)
    temp_tweets["date"] = pd.to_datetime(temp_tweets["date"])
    # Placeholder column; nothing in this cell fills it in.
    temp_tweets["sentiment_score"] = np.nan
    temp_tweets["month"] = temp_tweets["date"].dt.month
    temp_tweets["quarter"] = temp_tweets["month"].apply(get_quarter_info)
    yearly_frames.append(temp_tweets)
tweets = pd.concat(yearly_frames, ignore_index=True)

**Processing stitched tweets into quarterly sentiment proportions**

In [76]:
# For every (year, quarter) present in `tweets`, clean the tweet texts, score
# each remaining word with TextBlob, and record the share of words whose
# polarity is positive, zero (neutral) or negative.

# Compiled once instead of on every tweet. Written as a raw string so that
# \w, \/ and \S are regex escapes rather than invalid Python string escapes;
# the pattern itself is unchanged.
# NOTE(review): the blank before the first "|" means a non-alphanumeric
# character is only removed when a space follows it. That looks like a typo
# for "([^0-9A-Za-z \t])|" but changing it would alter the results -- confirm.
tweet_noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)")

# Polarity depends only on the word, so each distinct word is scored once.
polarity_cache = {}

quarter_records = []
for year in tweets.year.unique():
    annual_tweets = tweets[tweets.year == year]
    for quarter in annual_tweets.quarter.unique():
        quarter_texts = annual_tweets.loc[annual_tweets.quarter == quarter, "text"]

        quarter_word_list = []
        for raw_text in quarter_texts:
            # Skip non-string entries (e.g. an empty CSV field that pandas
            # read as NaN); re.sub would raise TypeError on them.
            if not isinstance(raw_text, str):
                continue
            quarter_word_list.extend(tweet_noise_pattern.sub(" ", raw_text).split())

        total_num = len(quarter_word_list)
        pos_num = 0
        nue_num = 0
        neg_num = 0
        for word in quarter_word_list:
            sent_score = polarity_cache.get(word)
            if sent_score is None:
                sent_score = TextBlob(word).sentiment.polarity
                polarity_cache[word] = sent_score
            if sent_score > 0:
                pos_num += 1
            elif sent_score < 0:
                neg_num += 1
            else:
                nue_num += 1

        if total_num:
            positive_share = pos_num / total_num
            neutral_share = nue_num / total_num
            negative_share = neg_num / total_num
        else:
            # No usable words in this quarter: report NaN instead of
            # dividing by zero.
            positive_share = neutral_share = negative_share = np.nan

        quarter_records.append({"year": year,
                                "quarter": quarter,
                                "positive": positive_share,
                                "neutral": neutral_share,
                                "negative": negative_share})

# Build the frame in one go (DataFrame.append was removed in pandas 2.0).
# Passing `columns` keeps the expected columns even when there are no rows.
sentiment_proportion = pd.DataFrame(
    quarter_records,
    columns=["year", "quarter", "positive", "neutral", "negative"],
)

In [77]:
# Order the rows chronologically, then label each row "<year>_<quarter>"
# (both parts rendered as integers).
sentiment_proportion = sentiment_proportion.sort_values(by=["year", "quarter"])
sentiment_proportion["timestamp"] = [
    str(int(year_value)) + "_" + str(int(quarter_value))
    for year_value, quarter_value in zip(sentiment_proportion["year"],
                                         sentiment_proportion["quarter"])
]

In [78]:
# Write the per-quarter sentiment table for this state to disk.
sentiment_csv_path = "sentiment_from_tweets/%s_sentiment.csv" % state_name
sentiment_proportion.to_csv(sentiment_csv_path)