In [2]:
import json
import time
from datetime import datetime
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

  from pandas.core import datetools


In [3]:
# all the functions

def load_file(files):
    #info of total tweets
    tw_info_total = []
    for line in files :
        #info of current tweets
        tw_info = []
        curr_tw = line
        #time current tweet is posted
        date = curr_tw['citation_date']
        #follower of this user
        foll = curr_tw['author']['followers']
        #name of author
        author = curr_tw['author']['name']
        #number of retweets
        re_tw = curr_tw['metrics']['citations']['total']
        #store info as (date,author,follower,retweet_count)
        tw_info.append(date)
        tw_info.append(author)
        tw_info.append(foll)
        tw_info.append(re_tw)
        tw_info_total.append(tw_info)
    #convert to dataframe
    df = pd.DataFrame(tw_info_total,columns=['time','author','follower','retweet'])
    df = df.sort_values(by = 'time')
    df = df.reset_index(drop=True)
    
    return df

def get_hour(time_stamp):
    pst_tz = pytz.timezone('US/Pacific')
    return (datetime.fromtimestamp(time_stamp, pst_tz)).hour

def avg_rmse_lr(features,labels):
    kf = KFold(n_splits=10, random_state=0, shuffle=True)

    test_mse_list = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
    
        linearregression.fit(X_train, y_train)
        
        # Compute mse for test set
        y_test_pred = linearregression.predict(X_test)
        test_mse_list.append(mean_squared_error(y_test, y_test_pred))
    
    avg_test_rmse = np.sqrt(np.mean(test_mse_list))

    print ('RMSE for linear regression model is', avg_test_rmse)

def avg_rmse_rf(features,labels):
    kf = KFold(n_splits=10, random_state=0, shuffle=True)

    test_mse_list = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
    
        rf_regressor.fit(X_train, y_train)
        
        # Compute mse for test set
        y_test_pred = rf_regressor.predict(X_test)
        test_mse_list.append(mean_squared_error(y_test, y_test_pred))
    
    avg_test_rmse = np.sqrt(np.mean(test_mse_list))

    print ('RMSE for Random Forest Regressor model is', avg_test_rmse)

def avg_rmse_svm(features,labels):
    kf = KFold(n_splits=10, random_state=0, shuffle=True)

    test_mse_list = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
    
        svm.fit(X_train, y_train)
        
        # Compute mse for test set
        y_test_pred = svm.predict(X_test)
        test_mse_list.append(mean_squared_error(y_test, y_test_pred))
    
    avg_test_rmse = np.sqrt(np.mean(test_mse_list))

    print ('RMSE for LinearSVR model is', avg_test_rmse)
    
    
def get_features(df):
    tw_tot = 0
    retweet_tot = 0
    foll_tot = 0
    author_visited_dict = dict()
    foll_max = 0
    start_hour = get_hour(df.iloc[0,0])
    
    feature_list_tot = []
    feature_list_curr = []
    
            
    for index, row in df.iterrows():
        curr_hour = get_hour(row['time'])
        curr_author = row['author']
        curr_foll = row['follower'] 
        curr_retweet = row['retweet']
        
        if curr_hour == start_hour:
            tw_tot += 1
            retweet_tot += curr_retweet
            foll_max = max(foll_max,curr_foll)
            #author visited or not
            if (curr_author not in author_visited_dict):
                foll_tot += curr_foll
                author_visited_dict[curr_author] = True;
            
        
        else:
            feature_list_curr.append(start_hour)
            feature_list_curr.append(tw_tot)
            feature_list_curr.append(retweet_tot)
            feature_list_curr.append(foll_tot)
            feature_list_curr.append(foll_max)
            feature_list_tot.append(feature_list_curr)
        
            #setup the counters
            tw_tot = 1
            start_hour = curr_hour
            retweet_tot = curr_retweet
            foll_tot = curr_foll
            foll_max = curr_foll
            author_visited_dict.clear()
            feature_list_curr = []
        
    feature_df = pd.DataFrame(feature_list_tot,columns=['time','tweets_total','retweets_total','followers_total','max_followers'])
    return feature_df

def cross_validation_3period(data):
    # split tweets
    time1 = 1422806400 #20150201 8:00am
    time2 = 1422849600 #20150201 8:00pm

    data_1 = []
    data_2 = []
    data_3 = []
    for i in range(0,len(data)):
        tweet = data[i]
        time = tweet["firstpost_date"]
        if   time < time1: 
             data_1.append(tweet)
        elif time >= time1 and time < time2: 
             data_2.append(tweet)
        else: 
             data_3.append(tweet)

    # first period
    df_1 = load_file(data_1)
    feature_df_1 = get_features(df_1)
    y_1 = np.nan_to_num(feature_df_1['tweets_total'].values)
    feature_df_1.drop(columns =['tweets_total'],inplace = True)
    x_1 = np.nan_to_num(feature_df_1.values)

    # rmse_1
    print "first period"
    avg_rmse_lr(x_1,y_1)
    avg_rmse_rf(x_1,y_1)
    avg_rmse_svm(x_1,y_1)
    
    # second period
    df_2 = load_file(data_2)
    feature_df_2 = get_features(df_2)
    y_2 = np.nan_to_num(feature_df_2['tweets_total'].values)
    feature_df_2.drop(columns =['tweets_total'],inplace = True)
    x_2 = np.nan_to_num(feature_df_2.values)    

    # rmse_2
    print "second period"
    avg_rmse_lr(x_2,y_2)
    avg_rmse_rf(x_2,y_2)
    avg_rmse_svm(x_2,y_2)
    
    
    # third period
    df_3 = load_file(data_3)
    feature_df_3 = get_features(df_3)
    y_3 = np.nan_to_num(feature_df_3['tweets_total'].values)
    feature_df_3.drop(columns =['tweets_total'],inplace = True)
    x_3 = np.nan_to_num(feature_df_3.values)    

    # rmse_3
    print "third period"
    avg_rmse_lr(x_3,y_3)
    avg_rmse_rf(x_3,y_3)
    avg_rmse_svm(x_3,y_3)

In [5]:
path = "tweet_data/"

files = ["tweets_#gohawks.txt", "tweets_#gopatriots.txt", 
        "tweets_#nfl.txt", "tweets_#patriots.txt", 
        "tweets_#sb49.txt", "tweets_#superbowl.txt"]

for index, name in enumerate(files):
    print ("files[" + str(index) + "] => " + name)
    
    


files[0] => tweets_#gohawks.txt
files[1] => tweets_#gopatriots.txt
files[2] => tweets_#nfl.txt
files[3] => tweets_#patriots.txt
files[4] => tweets_#sb49.txt
files[5] => tweets_#superbowl.txt


In [4]:
linearregression = LinearRegression()
rf_regressor = RandomForestRegressor(n_estimators=13,
                             max_features=3,
                             max_depth=11,
                             bootstrap=True,
                             oob_score=True,
                             random_state=0)
svm = LinearSVR(random_state = 0)

# Problem 1.4 (i)

In [19]:
# hashtag tweets_#gohawks 

# load #gohawks
f = open('tweet_data/tweets_#gohawks.txt')
f_start = f.tell()
f.seek(f_start)
gohawks = []

for line in f.readlines():
    tweet = json.loads(line)
    gohawks.append(tweet)

print 'hashtag gohawks'
cross_validation_3period(gohawks)

hashtag gohawks
first period
('RMSE for linear regression model is', 167.03768838768028)


  warn("Some inputs do not have OOB scores. "


('RMSE for Random Forest Regressor model is', 727.5054539499181)
('RMSE for LinearSVR model is', 803.8749796798273)
second period
('RMSE for linear regression model is', 1630.8844284398092)
('RMSE for Random Forest Regressor model is', 1596.6966920583197)
('RMSE for LinearSVR model is', 2819.767432446993)
third period
('RMSE for linear regression model is', 110.48743810108999)
('RMSE for Random Forest Regressor model is', 72.61467506221122)
('RMSE for LinearSVR model is', 1004.4379343756998)


In [20]:
# hashtag tweets_#gopatriots 
# load #gopatriots
f = open('tweet_data/tweets_#gopatriots.txt')
f_start = f.tell()
f.seek(f_start)
gopatriots = []

for line in f.readlines():
    tweet = json.loads(line)
    gopatriots.append(tweet)

print 'hashtag gopatriots'
cross_validation_3period(gopatriots)  

hashtag gopatriots
first period
('RMSE for linear regression model is', 27.738876380824657)
('RMSE for Random Forest Regressor model is', 32.68739272532083)
('RMSE for LinearSVR model is', 235.4174696983586)
second period
('RMSE for linear regression model is', 200.06407907757685)
('RMSE for Random Forest Regressor model is', 482.0313536865456)
('RMSE for LinearSVR model is', 4368.305756552469)
third period
('RMSE for linear regression model is', 10.841963798610175)
('RMSE for Random Forest Regressor model is', 21.621744205433732)
('RMSE for LinearSVR model is', 31.57203096725895)


In [22]:
# hashtag tweets_#nfl 
# load #nfl
f = open('tweet_data/tweets_#nfl.txt')
f_start = f.tell()
f.seek(f_start)
nfl = []

for line in f.readlines():
    tweet = json.loads(line)
    nfl.append(tweet)

print 'hashtag nfl'
cross_validation_3period(nfl) 

hashtag nfl
first period
('RMSE for linear regression model is', 155.9752610964478)
('RMSE for Random Forest Regressor model is', 224.45696125859274)
('RMSE for LinearSVR model is', 1273.4563794096803)
second period
('RMSE for linear regression model is', 1227.4562012989693)
('RMSE for Random Forest Regressor model is', 1258.7458206194854)
('RMSE for LinearSVR model is', 6052.015182532136)
third period
('RMSE for linear regression model is', 169.898347617545)
('RMSE for Random Forest Regressor model is', 132.92495049017492)
('RMSE for LinearSVR model is', 1275.2003070593773)


In [23]:
# hashtag tweets_#patriots 
# load #patriots
f = open('tweet_data/tweets_#patriots.txt')
f_start = f.tell()
f.seek(f_start)
patriots = []

for line in f.readlines():
    tweet = json.loads(line)
    patriots.append(tweet)

print 'hashtag patriots'
cross_validation_3period(patriots) 

hashtag patriots
first period
('RMSE for linear regression model is', 252.2388884646137)
('RMSE for Random Forest Regressor model is', 420.2640231172712)
('RMSE for LinearSVR model is', 837.5325952326065)
second period
('RMSE for linear regression model is', 6739.312504509933)
('RMSE for Random Forest Regressor model is', 4619.455209069529)
('RMSE for LinearSVR model is', 15040.273369849503)
third period
('RMSE for linear regression model is', 149.74025062904835)
('RMSE for Random Forest Regressor model is', 206.71995392030684)
('RMSE for LinearSVR model is', 1926.2940234028515)


In [5]:
# hashtag tweets_#sb49 
# load #sb49
f = open('tweet_data/tweets_#sb49.txt')
f_start = f.tell()
f.seek(f_start)
sb49 = []

for line in f.readlines():
    tweet = json.loads(line)
    sb49.append(tweet)

print 'hashtag sb49'
cross_validation_3period(sb49) 

hashtag sb49
first period
('RMSE for linear regression model is', 140.24665457188496)


  warn("Some inputs do not have OOB scores. "


('RMSE for Random Forest Regressor model is', 123.15885969349496)
('RMSE for LinearSVR model is', 8416.81182723944)
second period
('RMSE for linear regression model is', 20339.00615419173)
('RMSE for Random Forest Regressor model is', 12320.226618078446)
('RMSE for LinearSVR model is', 326388.1843666363)
third period
('RMSE for linear regression model is', 322.44149973391967)
('RMSE for Random Forest Regressor model is', 479.58561606692797)
('RMSE for LinearSVR model is', 12081.769207523153)


In [6]:
# hashtag tweets_#superbowl 
# load #superbowl
f = open('tweet_data/tweets_#superbowl.txt')
f_start = f.tell()
f.seek(f_start)
superbowl = []

for line in f.readlines():
    tweet = json.loads(line)
    superbowl.append(tweet)

print 'hashtag superbowl'
cross_validation_3period(superbowl) 

MemoryError: 

# (ii）

In [None]:
# all hashtags 
# By observing the top 3 features of each hashtag, we decided to choose 4 features, excluding 'time', to predict and fit the model
# load #gohawks
allhashtag = ['tweets_#gohawks.txt','tweets_#gopatriots.txt','tweets_#nfl.txt',
              'tweets_#patriots.txt','tweets_#sb49.txt','tweets_#superbowl.txt']

for hashtag in allhashtag:
    f = open ('tweet_data/' + hashtag)
    f_start = f.tell()
    f.seek(f_start)
    all_hashtag = []

    for line in f.readlines():
        tweet = json.loads(line)
        all_hashtag.append(tweet)

print 'all hashtag'
cross_validation_3period(all_hashtag)

# Part 1.5

In [None]:
# first period testing data
period_1_samples = ['sample1_period1','sample4_period1','sample5_period1','sample8_period1']
period_2_samples = ['sample2_period2','sample6_period2','sample9_period2']
period_3_samples = ['sample3_period3','sample7_period3','sample10_period3']

for sample in period_1_samples
    f = open ('test_data/' + sample)
    f_start = f.tell()
    f.seek(f_start)
    period_1_test = []

    for line in f.readlines():
        tweet = json.loads(line)
        period_1_test.append(tweet)
        
for sample in period_2_samples
    f = open ('test_data/' + sample)
    f_start = f.tell()
    f.seek(f_start)
    period_2_test = []

    for line in f.readlines():
        tweet = json.loads(line)
        period_2_test.append(tweet)
        
for sample in period_3_samples
    f = open ('test_data/' + sample)
    f_start = f.tell()
    f.seek(f_start)
    period_3_test = []

    for line in f.readlines():
        tweet = json.loads(line)
        period_3_test.append(tweet)
        
# split tweets
time1 = 1422806400 #20150201 8:00am
time2 = 1422849600 #20150201 8:00pm

data_1 = []
data_2 = []
data_3 = []
for i in range(0,len(all_hashtag)):
    tweet = all_hashtag[i]
    time = tweet["firstpost_date"]
    if   time < time1: 
         data_1.append(tweet)
    elif time >= time1 and time < time2: 
         data_2.append(tweet)
    else: 
         data_3.append(tweet)

df_1 = load_file(data_1)
feature_df_1 = get_features(df_1)
y_1 = np.nan_to_num(feature_df_1['tweets_total'].values)
feature_df_1.drop(columns =['tweets_total'],inplace = True)
x_1 = np.nan_to_num(feature_df_1.values)
rf_regressor.fit(x_1, y_1)

df_test_1 = load_file(period_1_test)
feature_df_test_1 = get_features(df_test_1)
y_test_1 = np.nan_to_num(feature_df_test_1['tweets_total'].values)
feature_df_test_1.drop(columns =['tweets_total'],inplace = True)
x_test_1 = np.nan_to_num(feature_df_test_1.values)

#calculating rmse for first period test data
y_test_pred_1 = rf_regressor.predict(x_test_1)
rmse_test_1 = np.sqrt(mean_squared_error(y_test_1, y_test_pred))


df_2 = load_file(data_2)
feature_df_2 = get_features(df_2)
y_2 = np.nan_to_num(feature_df_2['tweets_total'].values)
feature_df_2.drop(columns =['tweets_total'],inplace = True)
x_2 = np.nan_to_num(feature_df_2.values)
rf_regressor.fit(x_2, y_2)

df_test_2 = load_file(period_2_test)
feature_df_test_2 = get_features(df_test_2)
y_test_2 = np.nan_to_num(feature_df_test_2['tweets_total'].values)
feature_df_test_2.drop(columns =['tweets_total'],inplace = True)
x_test_2 = np.nan_to_num(feature_df_test_2.values)

#calculating rmse for second period test data
y_test_pred_2 = rf_regressor.predict(x_test_2)
rmse_test_2 = np.sqrt(mean_squared_error(y_test_2, y_test_pred_2))


df_3 = load_file(data_3)
feature_df_3 = get_features(df_3)
y_3 = np.nan_to_num(feature_df_3['tweets_total'].values)
feature_df_3.drop(columns =['tweets_total'],inplace = True)
x_3 = np.nan_to_num(feature_df_3.values)
rf_regressor.fit(x_3, y_3)

df_test_3 = load_file(period_3_test)
feature_df_test_3 = get_features(df_test_3)
y_test_3 = np.nan_to_num(feature_df_test_3['tweets_total'].values)
feature_df_test_3.drop(columns =['tweets_total'],inplace = True)
x_test_3 = np.nan_to_num(feature_df_test_3.values)

#calculating rmse for second period test data
y_test_pred_3 = rf_regressor.predict(x_test_3)
rmse_test_3 = np.sqrt(mean_squared_error(y_test_3, y_test_pred_3))