In [1]:
import json
import time
from datetime import datetime
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

  from pandas.core import datetools


In [2]:
# all the functions

def load_file_new(file):
    #info of total tweets
    tw_info_total = []
    for line in open(file) :
        tw_info = []
        curr_tw = json.loads(line)
        #time current tweet is posted
        date = curr_tw['citation_date']
        #author of this tweet
        author = curr_tw['author']['name']
        #follower of this user
        foll = curr_tw['author']['followers']
        #length of tweet
        len_tweet = len(curr_tw['tweet']['text'])
        #favorite count
        favor_ct = curr_tw['tweet']['favorite_count']
        #user mentioned count
        user_ment_ct = len(curr_tw['tweet']['entities']['user_mentions'])
        tw_info.append(date)
        tw_info.append(author)
        tw_info.append(foll)
        tw_info.append(len_tweet)
        tw_info.append(favor_ct)
        tw_info.append(user_ment_ct)
        tw_info_total.append(tw_info)
       
    df = pd.DataFrame(tw_info_total,columns=['time','author','followers','len of tweet','favorite_count','number of user_mentioned'])
    df = df.sort_values(by = 'time')
    df = df.reset_index(drop=True)
    
    return df

def load_file_test(file):
    #info of total tweets
    tw_info_total = []
    for line in open(file) :
        tw_info = []
        curr_tw = json.loads(line)
        #time current tweet is posted
        date = curr_tw['firstpost_date']
        #author of this tweet
        author = curr_tw['author']['name']
        #follower of this user
        foll = curr_tw['author']['followers']
        #length of tweet
        len_tweet = len(curr_tw['tweet']['text'])
        #favorite count
        favor_ct = curr_tw['tweet']['favorite_count']
        #user mentioned count
        user_ment_ct = len(curr_tw['tweet']['entities']['user_mentions'])
        tw_info.append(date)
        tw_info.append(author)
        tw_info.append(foll)
        tw_info.append(len_tweet)
        tw_info.append(favor_ct)
        tw_info.append(user_ment_ct)
        tw_info_total.append(tw_info)
       
    df = pd.DataFrame(tw_info_total,columns=['time','author','followers','len of tweet','favorite_count','number of user_mentioned'])
    df = df.sort_values(by = 'time')
    df = df.reset_index(drop=True)
    
    return df

def get_hour(time_stamp):
    pst_tz = pytz.timezone('US/Pacific')
    return (datetime.fromtimestamp(time_stamp, pst_tz)).hour

def avg_rmse_lr(features,labels):
    kf = KFold(n_splits=10, random_state=0, shuffle=True)

    test_mse_list = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
    
        linearregression.fit(X_train, y_train)
        
        # Compute mse for test set
        y_test_pred = linearregression.predict(X_test)
        test_mse_list.append(mean_squared_error(y_test, y_test_pred))
    
    avg_test_rmse = np.sqrt(np.mean(test_mse_list))

    print ('RMSE for linear regression model is', avg_test_rmse)

def avg_rmse_rf(features,labels):
    kf = KFold(n_splits=10, random_state=0, shuffle=True)

    test_mse_list = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
    
        rf_regressor.fit(X_train, y_train)
        
        # Compute mse for test set
        y_test_pred = rf_regressor.predict(X_test)
        test_mse_list.append(mean_squared_error(y_test, y_test_pred))
    
    avg_test_rmse = np.sqrt(np.mean(test_mse_list))

    print ('RMSE for Random Forest Regressor model is', avg_test_rmse)

def avg_rmse_svm(features,labels):
    kf = KFold(n_splits=10, random_state=0, shuffle=True)

    test_mse_list = []

    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
    
        svm.fit(X_train, y_train)
        
        # Compute mse for test set
        y_test_pred = svm.predict(X_test)
        test_mse_list.append(mean_squared_error(y_test, y_test_pred))
    
    avg_test_rmse = np.sqrt(np.mean(test_mse_list))

    print ('RMSE for LinearSVR model is', avg_test_rmse)
    
    
def get_features(df):
    tw_tot = 0
    foll_tot = 0
    author_visited_dict = dict()
    len_tweet = 0
    favorite_ct = 0
    user_ment = 0
    start_hour = get_hour(df.iloc[0,0])
    
    feature_list_tot = []
    feature_list_curr = []
    
            
    for index, row in df.iterrows():
        curr_hour = get_hour(row['time'])
        curr_author = row['author']
        curr_foll = row['followers'] 
        curr_len_tweet = row['len of tweet'] 
        curr_favorite_ct = row['favorite_count']
        curr_user_ment = row['number of user_mentioned']
        
        if curr_hour == start_hour:
            tw_tot += 1
            len_tweet += curr_len_tweet
            favorite_ct += curr_favorite_ct
            user_ment += curr_user_ment
            if (curr_author not in author_visited_dict):
                foll_tot += curr_foll
                author_visited_dict[curr_author] = True;
            
        
        else:
            feature_list_curr.append(start_hour)
            feature_list_curr.append(tw_tot)
            feature_list_curr.append(foll_tot)
            feature_list_curr.append(len_tweet/tw_tot)
            feature_list_curr.append(favorite_ct)
            feature_list_curr.append(user_ment)
            feature_list_tot.append(feature_list_curr)
        
            #setup the counters
            tw_tot = 1
            foll_tot = curr_foll
            len_tweet = curr_len_tweet
            favorite_ct = curr_favorite_ct
            user_ment = curr_user_ment
            start_hour = curr_hour
            feature_list_curr = []
            author_visited_dict.clear()
        
    feature_df = pd.DataFrame(feature_list_tot,columns=['time','tweets_total','followers_total',
                                'len of tweet(avg)','favorite_count','number of user_mentioned'])
    return feature_df


    
def cross_validation_3period_new(df):
    # split tweets
    time1 = 1422806400 #20150201 8:00am
    time2 = 1422849600 #20150201 8:00pm

    data_1 = []
    data_2 = []
    data_3 = []
    for i in range(0,len(df)):
        tweet = df.iloc[i]
        time = tweet["time"]
        if   time < time1: 
             data_1.append(tweet)
        elif time >= time1 and time < time2: 
             data_2.append(tweet)
        else: 
             data_3.append(tweet)

    df_1 = pd.DataFrame(data_1,columns=['time','author','followers',
                                             'len of tweet','favorite_count','number of user_mentioned'])
    df_1 = df_1.sort_values(by = 'time')
    df_1 = df_1.reset_index(drop=True)

    df_2 = pd.DataFrame(data_2,columns=['time','author','followers',
                                             'len of tweet','favorite_count','number of user_mentioned'])
    df_2 = df_2.sort_values(by = 'time')
    df_2 = df_2.reset_index(drop=True)
    
    df_3 = pd.DataFrame(data_3,columns=['time','author','followers',
                                             'len of tweet','favorite_count','number of user_mentioned'])
    df_3 = df_3.sort_values(by = 'time')
    df_3 = df_3.reset_index(drop=True)

    # first period
    feature_df_1 = get_features(df_1)
    y_1 = np.nan_to_num(feature_df_1['tweets_total'].values)
    feature_df_1.drop(columns =['time'],inplace = True)
    x_1 = np.nan_to_num(feature_df_1.values)
    x_1_len = x_1.shape[0]
    train_1 = x_1[0:x_1_len-1,:]
    y_1_len = len(y_1)
    target_1 = y_1[1:y_1_len]

    # rmse_1
    print "first period"
    avg_rmse_lr(train_1,target_1)
    avg_rmse_rf(train_1,target_1)
    avg_rmse_svm(train_1,target_1)
    
    # second period
    feature_df_2 = get_features(df_2)
    y_2 = np.nan_to_num(feature_df_2['tweets_total'].values)
    feature_df_2.drop(columns =['time'],inplace = True)
    x_2 = np.nan_to_num(feature_df_2.values) 
    x_2_len = x_2.shape[0]
    train_2 = x_2[0:x_2_len-1,:]
    y_2_len = len(y_2)
    target_2 = y_2[1:y_2_len]

    # rmse_2
    print "second period"
    avg_rmse_lr(train_2,target_2)
    avg_rmse_rf(train_2,target_2)
    avg_rmse_svm(train_2,target_2)
    
    
    # third period
    feature_df_3 = get_features(df_3)
    y_3 = np.nan_to_num(feature_df_3['tweets_total'].values)
    feature_df_3.drop(columns =['time'],inplace = True)
    x_3 = np.nan_to_num(feature_df_3.values)
    x_3_len = x_3.shape[0]
    train_3 = x_3[0:x_3_len-1,:]
    y_3_len = len(y_3)
    target_3 = y_3[1:y_3_len]

    # rmse_3
    print "third period"
    avg_rmse_lr(train_3,target_3)
    avg_rmse_rf(train_3,target_3)
    avg_rmse_svm(train_3,target_3)

In [3]:
path = "tweet_data/"

files = ["tweets_#gohawks.txt", "tweets_#gopatriots.txt", 
        "tweets_#nfl.txt", "tweets_#patriots.txt", 
        "tweets_#sb49.txt", "tweets_#superbowl.txt"]

for index, name in enumerate(files):
    print ("files[" + str(index) + "] => " + name)
    
    


files[0] => tweets_#gohawks.txt
files[1] => tweets_#gopatriots.txt
files[2] => tweets_#nfl.txt
files[3] => tweets_#patriots.txt
files[4] => tweets_#sb49.txt
files[5] => tweets_#superbowl.txt


In [4]:
linearregression = LinearRegression()
rf_regressor = RandomForestRegressor(n_estimators=13,
                             max_features=3,
                             max_depth=11,
                             bootstrap=True,
                             oob_score=True,
                             random_state=0)
svm = LinearSVR(random_state = 0)

# Problem 1.4 (i)

In [7]:
# hashtag tweets_#gohawks 

# load #gohawks
gohawks = load_file_new('tweet_data/tweets_#gohawks.txt')
print 'hashtag gohawks'
cross_validation_3period_new(gohawks)

hashtag gohawks
first period
('RMSE for linear regression model is', 1671.571821081425)


  warn("Some inputs do not have OOB scores. "


('RMSE for Random Forest Regressor model is', 1096.1852083005842)
('RMSE for LinearSVR model is', 1100.2600887786016)
second period
('RMSE for linear regression model is', 20266.900027210446)
('RMSE for Random Forest Regressor model is', 2769.1677585780044)
('RMSE for LinearSVR model is', 6514.048278880562)
third period
('RMSE for linear regression model is', 214.7525014340775)
('RMSE for Random Forest Regressor model is', 66.92900398846412)
('RMSE for LinearSVR model is', 383.0070664688258)


In [9]:
# hashtag tweets_#gopatriots 
# load #gopatriots
gopatriots  = load_file_new('tweet_data/tweets_#gopatriots.txt')
print 'hashtag gopatriots '
cross_validation_3period_new(gopatriots) 

hashtag gopatriots 
first period
('RMSE for linear regression model is', 60.64625904551412)
('RMSE for Random Forest Regressor model is', 61.618926891704106)
('RMSE for LinearSVR model is', 82.30775393437325)
second period
('RMSE for linear regression model is', 2217.6928222474935)
('RMSE for Random Forest Regressor model is', 1128.6984757287598)
('RMSE for LinearSVR model is', 1455.8720110671686)
third period
('RMSE for linear regression model is', 22.895030583627182)
('RMSE for Random Forest Regressor model is', 8.919631297604576)
('RMSE for LinearSVR model is', 98.13851963200935)


In [10]:
# hashtag tweets_#nfl
# load #nfl
nfl = load_file_new('tweet_data/tweets_#nfl.txt')
print 'hashtag nfl'
cross_validation_3period_new(nfl)

hashtag nfl
first period
('RMSE for linear regression model is', 318.1609315476443)
('RMSE for Random Forest Regressor model is', 284.64714221884753)
('RMSE for LinearSVR model is', 1630.519554138228)
second period
('RMSE for linear regression model is', 4326.9571395384555)
('RMSE for Random Forest Regressor model is', 3158.7069904439204)
('RMSE for LinearSVR model is', 7457.270071841277)
third period
('RMSE for linear regression model is', 156.25726821547462)
('RMSE for Random Forest Regressor model is', 166.93016726413404)
('RMSE for LinearSVR model is', 1733.4765280817935)


In [11]:
# hashtag tweets_#patriots 
# load #patriots
patriots = load_file_new('tweet_data/tweets_#patriots.txt')
print 'hashtag patriots'
cross_validation_3period_new(patriots)

hashtag patriots
first period
('RMSE for linear regression model is', 756.3088025731204)
('RMSE for Random Forest Regressor model is', 749.8751142851752)
('RMSE for LinearSVR model is', 1437.1633280867065)
second period
('RMSE for linear regression model is', 26082.288753840166)
('RMSE for Random Forest Regressor model is', 18435.08080132765)
('RMSE for LinearSVR model is', 21335.436806684756)
third period
('RMSE for linear regression model is', 319.02870954874817)
('RMSE for Random Forest Regressor model is', 153.14366527634579)
('RMSE for LinearSVR model is', 3043.9832472180296)


In [12]:
# hashtag tweets_#sb49 
# load #sb49
sb49 = load_file_new('tweet_data/tweets_#sb49.txt')
print 'hashtag sb49'
cross_validation_3period_new(sb49)  

hashtag sb49
first period
('RMSE for linear regression model is', 99.92309893343163)
('RMSE for Random Forest Regressor model is', 118.65700915657258)
('RMSE for LinearSVR model is', 743.363651130486)
second period
('RMSE for linear regression model is', 319765.74488015106)
('RMSE for Random Forest Regressor model is', 33098.93104544145)
('RMSE for LinearSVR model is', 33622.279871172745)
third period
('RMSE for linear regression model is', 1166.0111578620972)
('RMSE for Random Forest Regressor model is', 230.5987330029301)
('RMSE for LinearSVR model is', 9429.138141534404)


In [13]:
# hashtag tweets_#superbowl 
# load #superbowl

superbowl = load_file_new('tweet_data/tweets_#superbowl.txt')
print 'hashtag superbowl'
cross_validation_3period_new(superbowl) 

hashtag superbowl
first period
('RMSE for linear regression model is', 800.3889383965305)
('RMSE for Random Forest Regressor model is', 785.0638115117295)
('RMSE for LinearSVR model is', 34070.959436975594)
second period
('RMSE for linear regression model is', 627468.4500797167)
('RMSE for Random Forest Regressor model is', 68842.09401419864)
('RMSE for LinearSVR model is', 142623.71478833762)
third period
('RMSE for linear regression model is', 630.0717465460622)
('RMSE for Random Forest Regressor model is', 447.0603729858627)
('RMSE for LinearSVR model is', 5889.797743610945)


# (ii）

In [12]:
# all hashtags aggregated data

tw_info_all = []

for hashtag in files:
    for line in open(path + hashtag, 'r') :
        tw_info = []
        curr_tw = json.loads(line)
        #time current tweet is posted
        date = curr_tw['citation_date']
        #author of this tweet
        author = curr_tw['author']['name']
        #follower of this user
        foll = curr_tw['author']['followers']
        #length of tweet
        len_tweet = len(curr_tw['tweet']['text'])
        #favorite count
        favor_ct = curr_tw['tweet']['favorite_count']
        #user mentioned count
        user_ment_ct = len(curr_tw['tweet']['entities']['user_mentions'])
        tw_info.append(date)
        tw_info.append(author)
        tw_info.append(foll)
        tw_info.append(len_tweet)
        tw_info.append(favor_ct)
        tw_info.append(user_ment_ct)
        tw_info_all.append(tw_info)
    
df_all = pd.DataFrame(tw_info_all,columns=['time','author','followers',
                                             'len of tweet','favorite_count','number of user_mentioned'])
df_all = df_all.sort_values(by = 'time')
df_all = df_all.reset_index(drop=True)

print 'hashtag all hashtags'
cross_validation_3period_new(df_all) 

hashtag all hashtags
first period
('RMSE for linear regression model is', 2551.3706912759853)


  warn("Some inputs do not have OOB scores. "


('RMSE for Random Forest Regressor model is', 2685.010457186324)
('RMSE for LinearSVR model is', 18282.941331870403)
second period
('RMSE for linear regression model is', 540809.654549252)
('RMSE for Random Forest Regressor model is', 97771.44100768653)
('RMSE for LinearSVR model is', 160041.66435283766)
third period
('RMSE for linear regression model is', 1589.7368697188265)
('RMSE for Random Forest Regressor model is', 691.5662886364768)
('RMSE for LinearSVR model is', 12830.72981868729)


# Part 1.5

In [None]:
# using random forest regression

In [5]:

period_1_samples = ['test_data/sample1_period1.txt','test_data/sample4_period1.txt','test_data/sample5_period1.txt','test_data/sample8_period1.txt']
period_2_samples = ['test_data/sample2_period2.txt','test_data/sample6_period2.txt','test_data/sample9_period2.txt']
period_3_samples = ['test_data/sample3_period3.txt','test_data/sample7_period3.txt','test_data/sample10_period3.txt']

In [6]:
# load data 
tw_info_all = []

for hashtag in files:
    for line in open(path + hashtag, 'r') :
        tw_info = []
        curr_tw = json.loads(line)
        #time current tweet is posted
        date = curr_tw['firstpost_date']
        #author of this tweet
        author = curr_tw['author']['name']
        #follower of this user
        foll = curr_tw['author']['followers']
        #length of tweet
        len_tweet = len(curr_tw['tweet']['text'])
        #favorite count
        favor_ct = curr_tw['tweet']['favorite_count']
        #user mentioned count
        user_ment_ct = len(curr_tw['tweet']['entities']['user_mentions'])
        tw_info.append(date)
        tw_info.append(author)
        tw_info.append(foll)
        tw_info.append(len_tweet)
        tw_info.append(favor_ct)
        tw_info.append(user_ment_ct)
        tw_info_all.append(tw_info)
    
df_all = pd.DataFrame(tw_info_all,columns=['time','author','followers',
                                             'len of tweet','favorite_count','number of user_mentioned'])
df_all = df_all.sort_values(by = 'time')
df_all = df_all.reset_index(drop=True)

In [7]:
# split tweets
time1 = 1422806400 #20150201 8:00am
time2 = 1422849600 #20150201 8:00pm

data_1 = []
data_2 = []
data_3 = []
for i in range(0,len(df_all)):
    tweet = df_all.iloc[i]
    time = tweet["time"]
    if   time < time1: 
         data_1.append(tweet)
    elif time >= time1 and time < time2: 
         data_2.append(tweet)
    else: 
         data_3.append(tweet)

df_1 = pd.DataFrame(data_1,columns=['time','author','followers',
                                             'len of tweet','favorite_count','number of user_mentioned'])
df_1 = df_1.sort_values(by = 'time')
df_1 = df_1.reset_index(drop=True)

df_2 = pd.DataFrame(data_2,columns=['time','author','followers',
                                             'len of tweet','favorite_count','number of user_mentioned'])
df_2 = df_2.sort_values(by = 'time')
df_2 = df_2.reset_index(drop=True)
    
df_3 = pd.DataFrame(data_3,columns=['time','author','followers',
                                             'len of tweet','favorite_count','number of user_mentioned'])
df_3 = df_3.sort_values(by = 'time')
df_3 = df_3.reset_index(drop=True)

# first period
feature_df_1 = get_features(df_1)
y_1 = np.nan_to_num(feature_df_1['tweets_total'].values)
feature_df_1.drop(columns =['time'],inplace = True)
x_1 = np.nan_to_num(feature_df_1.values)
x_1_len = x_1.shape[0]
train_1 = x_1[0:x_1_len-1,:]
y_1_len = len(y_1)
target_1 = y_1[1:y_1_len]
rf_regressor_1 = rf_regressor.fit(train_1,target_1)

for sample in period_1_samples:
    df_test = load_file_test(sample)
    feature_df_test = get_features(df_test)
    feature_df_test.drop(columns =['time'],inplace = True)
    x_test = np.nan_to_num(feature_df_test.values)
    x_test_len = x_test.shape[0]
    test = x_test[0:x_test_len-1,:]
    predicted_test = rf_regressor_1.predict(test)
    print sample
    print predicted_test
    
# second period
feature_df_2 = get_features(df_2)
y_2 = np.nan_to_num(feature_df_2['tweets_total'].values)
feature_df_2.drop(columns =['time'],inplace = True)
x_2 = np.nan_to_num(feature_df_2.values)
x_2_len = x_2.shape[0]
train_2 = x_2[0:x_2_len-1,:]
y_2_len = len(y_2)
target_2 = y_2[1:y_2_len]
rf_regressor_2 = rf_regressor.fit(train_2,target_2)

for sample in period_2_samples:
    df_test = load_file_test(sample)
    feature_df_test = get_features(df_test)
    feature_df_test.drop(columns =['time'],inplace = True)
    x_test = np.nan_to_num(feature_df_test.values)
    x_test_len = x_test.shape[0]
    test = x_test[0:x_test_len-1,:]
    predicted_test = rf_regressor_2.predict(test)
    print sample
    print predicted_test

# third period
feature_df_3 = get_features(df_3)
y_3 = np.nan_to_num(feature_df_3['tweets_total'].values)
feature_df_3.drop(columns =['time'],inplace = True)
x_3 = np.nan_to_num(feature_df_3.values)
x_3_len = x_3.shape[0]
train_3 = x_3[0:x_3_len-1,:]
y_3_len = len(y_3)
target_3 = y_3[1:y_3_len]
rf_regressor_3 = rf_regressor.fit(train_3,target_3)

for sample in period_3_samples:
    df_test = load_file_test(sample)
    feature_df_test = get_features(df_test)
    feature_df_test.drop(columns =['time'],inplace = True)
    x_test = np.nan_to_num(feature_df_test.values)
    x_test_len = x_test.shape[0]
    test = x_test[0:x_test_len-1,:]
    predicted_test = rf_regressor_3.predict(test)
    print sample
    print predicted_test

  warn("Some inputs do not have OOB scores. "


test_data/sample1_period1.txt
[215.03846154 181.32564103 170.63589744 298.58974359]
test_data/sample4_period1.txt
[526.40769231 384.93205128 244.7481685  237.97530825]
test_data/sample5_period1.txt
[ 518.54807692 1393.42655678  519.68296703  250.78579882]
test_data/sample8_period1.txt
[200.05128205 154.71794872 148.91575092]
test_data/sample2_period2.txt
[133388.53846154 133388.53846154 133388.53846154 135944.        ]
test_data/sample6_period2.txt
[133388.53846154 135944.         184159.38461538 171287.30769231]
test_data/sample9_period2.txt
[133388.53846154 133388.53846154 133388.53846154 133388.53846154]
test_data/sample3_period3.txt
[ 761.46153846  867.80769231 1004.57692308  991.11965812]
test_data/sample7_period3.txt
[65.69230769 64.53846154 63.         57.23076923]
test_data/sample10_period3.txt
[57.23076923 57.23076923 57.23076923 59.07692308]
