In [1]:
import json
import pytz
import datetime
import time
import winsound
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import statsmodels.api as sm

from sklearn.model_selection import cross_val_score




In [42]:

# pytz time zone object for America/Los_Angeles (US Pacific), shared by the code below.
pst_tz = pytz.timezone('America/Los_Angeles')

def min_max_timestamps(s):
    """Return the smallest and largest ``citation_date`` found in a JSON-lines file.

    Parameters
    ----------
    s : str
        Path of a UTF-8 text file holding one JSON object per line; each
        object is read for its ``citation_date`` key.

    Returns
    -------
    list
        ``[earliest, latest]``. When the file contains no lines the starting
        sentinels ``[10e9, -1]`` are returned unchanged.
    """
    earliest, latest = 10e9, -1

    with open(s, encoding="utf-8") as handle:
        for raw_line in handle:
            cited_at = json.loads(raw_line)['citation_date']
            # min/max keep the current value on ties, like a strict </> test.
            earliest = min(earliest, cited_at)
            latest = max(latest, cited_at)
    return [earliest, latest]


def feature_extraction(beginstamp, endstamp, window,s):
    """Aggregate a JSON-lines tweet file into fixed-width time windows.

    ``beginstamp`` and ``endstamp`` are both rounded down to a multiple of
    ``window``; the half-open range between them is cut into consecutive
    windows and one row of sums is produced per window.

    Parameters
    ----------
    beginstamp, endstamp : int
        Range to cover, in the same unit as each record's ``citation_date``.
    window : int
        Window width, in that same unit.
    s : str
        Path of a UTF-8 file with one JSON object per line. For records that
        fall inside the range the keys ``metrics.citations.total``,
        ``metrics.ranking_score``, ``tweet.user.listed_count`` (may be null)
        and ``original_author.followers`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per window with columns, in this order:
        ``number_of_tweets``, ``number_of_retweets``, ``listed_count``,
        ``s_foll_of_orig_auth``, ``ranking_score``, ``target``.
        ``target`` for window ``k`` is the number of records that fall in
        window ``k + 1``; for the last row that is the window starting at the
        rounded ``endstamp``.
    """
    # Align both ends of the range to window boundaries (round down).
    beginstamp -= beginstamp % window
    endstamp -= endstamp % window

    # Both ends are multiples of `window`, so floor division is exact.
    limit = max(0, int((endstamp - beginstamp) // window))

    number_of_tweets = [0] * limit
    number_of_retweets = [0] * limit
    listed_count = [0] * limit
    s_foll_of_orig_auth = [0] * limit
    ranking_score = [0] * limit
    target = [0] * limit

    with open(s, encoding="utf-8") as f:
        for line in f:
            json_object = json.loads(line)
            # Index of the window holding this record; negative when the
            # record is older than the (aligned) start of the range.
            idx = int((json_object['citation_date'] - beginstamp) // window)

            if 0 <= idx < limit:
                metrics = json_object['metrics']
                number_of_tweets[idx] += 1
                number_of_retweets[idx] += metrics['citations']['total']
                listed = json_object['tweet']['user']['listed_count']
                if listed is not None:  # null listed_count contributes nothing
                    listed_count[idx] += listed
                s_foll_of_orig_auth[idx] += json_object['original_author']['followers']
                ranking_score[idx] += metrics['ranking_score']

            # A record in window idx is part of the target of window idx - 1.
            # idx == limit is accepted so the last row gets a target too.
            if 0 < idx <= limit:
                target[idx - 1] += 1

    # `target` must stay the last column: callers slice it off with [:, -1].
    feature_target = pd.DataFrame(
        {'number_of_tweets': number_of_tweets,
         'number_of_retweets': number_of_retweets,
         'listed_count': listed_count,
         's_foll_of_orig_auth': s_foll_of_orig_auth,
         'ranking_score': ranking_score,
         'target': target
         })
    return feature_target



def lin_regress_r(datum,testname,testwindow):
    """Fit a linear regression on ``datum`` and score it on a test file.

    Parameters
    ----------
    datum : 2-D array
        Training matrix; every column except the last is a feature and the
        last column is the regression target.
    testname : str
        Path of the test file. Its timestamp range is taken from
        ``min_max_timestamps`` and its features from ``feature_extraction``.
    testwindow : int
        Window width passed to ``feature_extraction`` for the test file.

    Returns
    -------
    list
        One-element list holding the mean squared error on the test file.

    Side effects
    ------------
    Prints up to five ``actual & predicted`` pairs followed by a
    ``testname & mse`` line, each terminated like a LaTeX table row.
    """
    reg_fin = LinearRegression().fit(datum[:, :-1], datum[:, -1])

    ################################  testing ##########################################################

    beginstamptest, endstamptest = min_max_timestamps(testname)
    testfeatures = feature_extraction(beginstamptest, endstamptest, testwindow, testname)
    testdata = testfeatures.values

    actual = testdata[:, -1]
    testpred = reg_fin.predict(testdata[:, :-1])

    # mean_squared_error is used as-is: the reported figure is the MSE, not its root.
    testmse = mean_squared_error(actual, testpred)

    # Slicing (rather than range(5)) keeps this safe when fewer than five
    # windows exist. The raw string yields the same `\\ \hline` text without
    # relying on invalid escape sequences.
    for observed, predicted in zip(actual[:5], testpred[:5]):
        print('%.2f' % observed, '&', '%.2f' % predicted, r'\\ \hline')

    print()
    print(testname, '&', testmse, r'\\ \hline')
    print('-'*35)

    return [testmse]


In [3]:

# Per-hashtag inputs, kept for reference:
# names = ["tweets_#gohawks.txt", "tweets_#gopatriots.txt","tweets_#nfl.txt", "tweets_#patriots.txt", "tweets_#sb49.txt", "tweets_#superbowl.txt"]
names = ['file_aggreg.txt']
frequency = 2500  # beep pitch, in hertz
duration = 2000  # beep length, in milliseconds (2 seconds)
# Three consecutive period files per sample: sample0 periods 1-3, then sample1, then sample2.
testnames = ['sample0_period1.txt','sample0_period2.txt','sample0_period3.txt','sample1_period1.txt','sample1_period2.txt','sample1_period3.txt','sample2_period1.txt','sample2_period2.txt','sample2_period3.txt']
for s in names:
    beginstamp, endstamp = min_max_timestamps(s)

    # Period boundaries: 2015-02-01 08:00 and 20:00, Pacific time.
    # localize() + timestamp() gives the epoch value for that Pacific wall
    # clock time on any machine; time.mktime(...timetuple()) would instead
    # interpret the fields in the local zone of whatever host runs the cell.
    stamp1 = int(pst_tz.localize(datetime.datetime(2015, 2, 1, 8, 0, 0)).timestamp())
    stamp2 = int(pst_tz.localize(datetime.datetime(2015, 2, 1, 20, 0, 0)).timestamp())

    # Before / between / after the two boundaries; the middle period uses a
    # 300-unit window, the outer two a 3600-unit window.
    feature1 = feature_extraction(beginstamp,stamp1,3600,s)
    feature2 = feature_extraction(stamp1,stamp2,300,s)
    feature3 = feature_extraction(stamp2,endstamp,3600,s)
    



In [43]:
    # Report labels; not read anywhere in this cell.
    name = ['MSE score','R-squared score','Train MSE','Test MSE']
    # testnames holds three consecutive period files per sample, so sample i
    # starts at position 3 * i.
    for i in range(3):
        first = i * 3

        # Period 1 model, 3600-wide windows.
        dk1 = feature1.values
        print(testnames[first + 0])
        win1 = lin_regress_r(dk1, testnames[first + 0], 3600)
        print()

        # Period 2 model, 300-wide windows.
        dk2 = feature2.values
        print(testnames[first + 1])
        win2 = lin_regress_r(dk2, testnames[first + 1], 300)
        print()

        # Period 3 model, 3600-wide windows.
        dk3 = feature3.values
        print(testnames[first + 2])
        win3 = lin_regress_r(dk3, testnames[first + 2], 3600)
        print()

    # Play a tone once every sample has been processed (winsound is Windows-only).
    winsound.Beep(frequency, duration)

sample0_period1.txt
79.00 & 551.52 \\ \hline
94.00 & 601.67 \\ \hline
101.00 & 580.04 \\ \hline
122.00 & 665.35 \\ \hline
120.00 & 573.10 \\ \hline

sample0_period1.txt & 242201.8052171222 \\ \hline
-----------------------------------

sample0_period2.txt
3834.00 & 4304.73 \\ \hline
2258.00 & 4509.67 \\ \hline
1455.00 & 2962.63 \\ \hline
1235.00 & 1942.59 \\ \hline
1123.00 & 1956.05 \\ \hline

sample0_period2.txt & 1751840.8720342466 \\ \hline
-----------------------------------

sample0_period3.txt
48.00 & 393.50 \\ \hline
94.00 & 352.80 \\ \hline
45.00 & 382.84 \\ \hline
77.00 & 369.21 \\ \hline
87.00 & 367.96 \\ \hline

sample0_period3.txt & 92962.60184462288 \\ \hline
-----------------------------------

sample1_period1.txt
180.00 & 613.24 \\ \hline
202.00 & 566.80 \\ \hline
294.00 & 614.48 \\ \hline
555.00 & 726.85 \\ \hline
846.00 & 882.50 \\ \hline

sample1_period1.txt & 90869.38415277826 \\ \hline
-----------------------------------

sample1_period2.txt
995.00 & 1692.14 \\ \hli