In [1]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
words = set(nltk.corpus.words.words())

In [44]:
#Import the data and clean the DF
df = pd.read_csv("data/aggregated_DB.csv", low_memory=False)
df = df.drop_duplicates(subset='tweet', keep="first")
df.to_csv("data/aggregated_DB.csv")

df = df[['date', 'tweet']]
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')
print(df.shape)

(1587867, 2)


In [49]:
#___________________________settings for estimation ____________________
#Threshold for sentiment classification => range: (1,0)
continuous_or_discrete = "disc"
threshold = 0.3
#Use Naive Bayes to classify into relevance scores?
bayes = 1 #this option is a work in progress
#_______________________________________________________________________

#define a DF to store the last estimation parameters
settings = {'Setting':['cont_or_disc', 'threshold', 'bayes',],
               'value':[continuous_or_discrete, threshold, bayes]}

settings_df = pd.DataFrame(settings)
settings_df.to_csv(f"results/settings_{continuous_or_discrete}.csv")
print(settings_df)

        Setting value
0  cont_or_disc  disc
1     threshold   0.3
2         bayes     1


In [3]:
#Cleaning
#Define a function to clean tweet of special characters and whitespace using regex
def cleaner(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",str(tweet)) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", str(tweet)) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet)
         if w.lower() in words or not w.isalpha())
    return tweet

df['tweet_clean'] = df['tweet'].apply(cleaner)
df = df.drop('tweet', axis=1)
df = df.drop_duplicates(subset='tweet_clean', keep="first")

In [4]:
#Calculating Sentiment
#Uncomment the Following two lines if not already downloaded
#Lines below 
#nltk.download('vader_lexicon')
#nltk.download('words')

list1 = []
for i in df['tweet_clean']:
    list1.append((sid.polarity_scores(str(i)))['compound'])

df['sentiment'] = pd.Series(list1)
df.to_csv('temp/tempsave.csv')

In [28]:
df = pd.read_csv('temp/tempsave.csv')
#Import csv of keywords derived from michigan survey and create lists
df_kws = pd.read_csv("kws/kws.csv")
kw_list = df_kws['Indicator'].values.tolist()
category_list = df_kws['Category'].values.tolist()
category_list = [*set(category_list)]

#Detect which keyword was used and store in DF
df['Indicator'] = np.nan
for term in kw_list:
    df['Indicator'] = np.where(df['tweet_clean'].str.contains(term, case=False) == True, term, df['Indicator'])

#Merge in df containing keyword categories
df = pd.merge(df_kws, df, on='Indicator')

df = df[['date', 'sentiment', 'Indicator', 'Category']]

In [29]:
#Map continuous score to discrete if the option is selected
if continuous_or_discrete == 'disc':
    df['sentiment_score'] = pd.cut(x=df['sentiment'], 
                                   bins=[-1.1,-.03,threshold,1.1],
                                   labels =[-1,0,1])
else:
    df['sentiment_score'] = df['sentiment']

df['sentiment_score'] = pd.factorize(df['sentiment_score'])[0] - 1

df.tail()

Unnamed: 0,date,sentiment,Indicator,Category,sentiment_score
1126484,2018-01-29 20:30:37,0.97,Cost of living,prices,1
1126485,2018-01-29 20:26:52,0.6908,Cost of living,prices,1
1126486,2018-01-29 20:25:18,-0.296,Cost of living,prices,-1
1126487,2018-01-29 20:25:13,0.7779,Cost of living,prices,1
1126488,2018-01-29 20:23:35,-0.2023,Cost of living,prices,-1


In [30]:
df_t = df.pivot_table(index='date', columns='Category', values='sentiment_score', aggfunc='mean')
df_counts = df.pivot_table(index='date', columns='Category', values='sentiment_score', aggfunc='count')
df_t.head()
df_t.to_csv("temp/temp4.csv")
df_counts.to_csv("temp/counts.csv")

In [35]:
df= pd.read_csv("temp/temp4.csv")
#df= pd.read_csv("temp/counts.csv")

print(df.shape)
df = df.reset_index()
df_c = df[df['date'].str.contains( "'" )==False ]
print(df.shape)
df_c['date'] = pd.to_datetime(df_c['date'], format='%Y-%m-%d %H:%M:%S')
df_c = df_c.set_index('date')
agg = df_c.resample('D').mean()
print(agg.shape)
cat_list_roll = []
for cat in category_list:   
    agg[cat +"_roll"] = agg[cat].rolling(30).mean()
    cat_list_roll.append([cat +"_roll"])

print(cat_list_roll)

agg.to_csv(f"results/current_trend_{continuous_or_discrete}.csv")
fig = px.line(agg, x=agg.index, y=["general_roll","prices_roll","policy_roll"], title='Category Sentiment')
fig.show()

(1072564, 5)
(1072564, 6)
(1674, 5)
[['prices_roll'], ['personal_roll'], ['policy_roll'], ['general_roll']]


In [34]:
agg.head()

Unnamed: 0_level_0,index,general,personal,policy,prices,prices_roll,personal_roll,policy_roll,general_roll
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-12-31,255.5,0.113269,-0.128205,-0.05,0.020134,,,,
2018-01-01,794.5,-0.124625,-0.05,-0.103448,0.108187,,,,
2018-01-02,1360.5,-0.135314,-0.06338,-0.043478,0.244382,,,,
2018-01-03,1937.5,0.071429,-0.454545,0.210526,0.044199,,,,
2018-01-04,2546.0,0.078797,-0.444444,0.051282,0.019663,,,,


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

agg_full = agg.apply(lambda row: row.fillna(row.mean()), axis=1)

#prepare the input series (x) by detrending and standardizing
x = StandardScaler().fit_transform(agg_full)

#Calculate the first  principal component & output explained variance
pca = PCA(n_components=1)
df_PC = pca.fit_transform(x)
print(pca.explained_variance_ratio_)
save_date = ['Sentiment']

df_PC_merge = pd.DataFrame(df_PC, columns = ['Sentiment'])

df1 = pd.concat([save_date, df_PC_merge], axis=1)
df_final = df1.set_index('date')
#df_final.to_csv("/tetemp.csv")



[0.26063161]


In [None]:
import plotly.express as px
df_final.to_csv("export.csv")
df_final['Sentiment'] = df_final['Sentiment']*(-1)
fig = px.line(df_final, x=df_final.index, y=["Sentiment"], title='PC Sentiment')
fig.show()

In [None]:
#This is used for inspecting the values
start_date = '2017-12-01'
end_date = '2018-01-31'
mask = (df['date'] > start_date) & (df['date'] <= end_date)
df_trim = df.loc[mask]

df_trim = df_trim[df_trim['sentiment']>-0.8]
df_trim = df_trim[['date','tweet_clean','sentiment']].copy()
df_trim.to_csv("inspect_high.csv")

In [None]:
#This stores obs. per day
df = df.reset_index()
df = df[df['date'].str.contains( "'" )==False ]
df_temp = df[['date', 'counter']]
df_temp['date'] = pd.to_datetime(df_temp['date'], format='%Y-%m-%d %H:%M:%S')

df_temp = df_temp.set_index('date')
df_temp = df_temp.resample('D').sum()
fig2 = px.line(df_temp, x=df_temp.index, y='counter', title='#obs per day')
fig2.show()


In [None]:
#This removes those that have already been estimated
if re_est != 1:
    df_already_covered = pd.read_csv(f"results/current_trend_{continuous_or_discrete}.csv")
    df_already_covered = df_already_covered.reset_index()
    df_already_covered
    df_trim = pd.DataFrame()
    for i in df.iteritems():
        df_trim = df[~(df['date'].day == df_already_covered['date'](i).day)]

df_trim.head()