In [9]:
import os
import json
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.cross_validation import cross_val_predict
from sklearn.ensemble import RandomForestRegressor

import sys
import warnings

# Suppress every warning for the rest of the session, unless the interpreter
# was started with explicit warning options (sys.warnoptions is then non-empty).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [10]:
# Training inputs: hashtag -> [file name under tweet_data/, tweet count].
# fetch_data uses the count as the maximum number of lines read from the file.
traingDict = {
    '#GoHawks': ['tweets_#gohawks.txt', 188136],
    '#GoPatriots': ['tweets_#gopatriots.txt', 26232],
    '#NFL': ['tweets_#nfl.txt', 259024],
    '#Patriots': ['tweets_#patriots.txt', 489713],
    '#SB49': ['tweets_#sb49.txt', 826951],
    '#SuperBowl': ['tweets_#superbowl.txt', 1348767],
}

# Testing inputs: sample index -> [file name under test_data/, tweet count].
# The digit right before '.txt' (the "periodN" suffix) is what data_regression
# reads to decide which period of the training data to fit on.
testingDict = {
    1: ['sample1_period1.txt', 730],
    2: ['sample2_period2.txt', 212273],
    3: ['sample3_period3.txt', 3628],
    4: ['sample4_period1.txt', 1646],
    5: ['sample5_period1.txt', 2059],
    6: ['sample6_period2.txt', 205554],
    7: ['sample7_period3.txt', 528],
    8: ['sample8_period1.txt', 229],
    9: ['sample9_period2.txt', 11311],
    10: ['sample10_period3.txt', 365],
}

In [11]:

def fetch_data(filename_key, dataDict, testingData):
    """Aggregate one tweet file into hourly features and write them to a CSV.

    NOTE: this notebook defines fetch_data in two consecutive cells; the
    later definition is the one in effect when the pipeline runs.

    Parameters
    ----------
    filename_key : key into ``dataDict`` (a hashtag for training data, an
        integer sample index for testing data).
    dataDict : dict mapping each key to ``[file_name, tweet_count]``.
        ``tweet_count`` caps how many tweets are read from the file.
    testingData : bool
        True  -> read ``./test_data/<file_name>``, take the tweet time from
                 ``firstpost_date`` and start the first hourly window at the
                 top of the hour containing the earliest tweet.
        False -> read ``tweet_data/<file_name>``, take the tweet time from
                 ``citation_date`` and start the first hourly window at the
                 fixed epoch 1421222400 (2015-01-14 08:00:00 UTC).

    Side effects
    ------------
    Creates ``./retrievedData`` if needed and writes one CSV row per hourly
    window to ``./retrievedData/Q5_<name>.csv`` where ``<name>`` is the file
    name without its 4-character extension (testing) or ``filename_key``
    (training). ``target_value`` is the next window's tweet count; the last
    row gets the placeholder 0 because no next window exists.

    Raises
    ------
    ValueError
        If the file contains no tweets (or ``tweet_count`` is 0).
    """
    fileName, maxTweets = dataDict[filename_key][0], dataDict[filename_key][1]
    if testingData:
        filePath = './test_data/' + fileName
        postingTimeFeature = 'firstpost_date'
    else:
        filePath = 'tweet_data/' + fileName
        postingTimeFeature = 'citation_date'

    # Collect only the tweets actually present in the file. Pre-sizing the
    # per-tweet arrays from the declared count would leave timestamp 0 in
    # unfilled slots whenever the file is shorter than declared, which
    # corrupts min(timeStamps) and the hourly bins.
    tweets = []
    with open(filePath) as input_file:  # closed even if a line fails to parse
        for line in input_file:
            if len(tweets) >= maxTweets:
                break
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            record = json.loads(line)
            authorName = record['author']['nick']
            tweets.append((
                record[postingTimeFeature],
                # A tweet posted by someone other than its original author
                # is counted as a retweet.
                authorName != record['original_author']['nick'],
                record['author']['followers'],
                len(record['tweet']['entities']['urls']),
                authorName,
                len(record['tweet']['entities']['user_mentions']),
                record['metrics']['ranking_score'],
                record['title'].count('#'),
            ))

    if not tweets:
        raise ValueError('no tweets found in ' + filePath)

    timeStamps = [tweet[0] for tweet in tweets]

    # '//' keeps the floor-to-the-hour behaviour on every Python version;
    # with true division '(x/3600)*3600' would be a no-op.
    startTime = 1421222400
    if testingData:
        startTime = (min(timeStamps) // 3600) * 3600

    hours_passed = int((max(timeStamps) - startTime) // 3600) + 1

    hourlyTweets = [0] * hours_passed
    hourlyRetweets = [0] * hours_passed
    hourlyFollowerSum = [0] * hours_passed
    hourlyMaxFollowers = [0] * hours_passed
    hourlyURLCitations = [0] * hours_passed
    hourlyAuthorSet = [set() for _ in range(hours_passed)]
    hourlyMentions = [0] * hours_passed
    hourlyRankingScores = [0.0] * hours_passed
    hourlyHashtags = [0] * hours_passed

    for (stamp, retweeted, followers, urls,
         author, mentions, score, hashtags) in tweets:
        # A tweet older than startTime gives a negative hour, which Python
        # would index from the end of the lists; the input is assumed not to
        # contain such tweets.
        current_hour = int((stamp - startTime) // 3600)

        hourlyTweets[current_hour] += 1
        if retweeted:
            hourlyRetweets[current_hour] += 1
        hourlyFollowerSum[current_hour] += followers
        if followers > hourlyMaxFollowers[current_hour]:
            hourlyMaxFollowers[current_hour] = followers
        hourlyURLCitations[current_hour] += urls
        hourlyAuthorSet[current_hour].add(author)
        hourlyMentions[current_hour] += mentions
        hourlyRankingScores[current_hour] += score
        hourlyHashtags[current_hour] += hashtags

    # Number of distinct authors per window.
    hourlyAuthors = [len(authors) for authors in hourlyAuthorSet]

    # Hour of day of each window, counted from the fixed epoch. Training
    # windows start at the epoch itself, so their offset is 0.
    firstHour = (startTime - 1421222400) // 3600 if testingData else 0
    hourlyTime = [(firstHour + i) % 24 for i in range(hours_passed)]

    # Regression target: the tweet count of the following window.
    target_value = hourlyTweets[1:]
    target_value.append(0)

    # Going through one numpy array (rather than per-column data) keeps every
    # column as float, so the CSV formatting stays the same as before.
    features = np.transpose(np.array([hourlyTweets,
                                      hourlyRetweets,
                                      hourlyFollowerSum,
                                      hourlyMaxFollowers,
                                      hourlyTime,
                                      hourlyURLCitations,
                                      hourlyAuthors,
                                      hourlyMentions,
                                      hourlyRankingScores,
                                      hourlyHashtags,
                                      target_value]))
    df = DataFrame(features)
    df.columns = ['num_tweets',
                  'num_retweets',
                  'sum_followers',
                  'max_followers',
                  'timeDay',
                  'num_URLs',
                  'num_authors',
                  'num_mensions',
                  'ranking_score',
                  'num_hashtags',
                  'target_value']

    if not os.path.isdir('./retrievedData'):
        os.mkdir('./retrievedData')

    if testingData:
        df.to_csv('./retrievedData/Q5_'+fileName[:-4]+'.csv', index = False)
    else:
        df.to_csv('./retrievedData/Q5_'+filename_key+'.csv', index = False)


In [12]:
def fetch_data(filename_key, dataDict, testingData):
    """Aggregate one tweet file into hourly features and write them to a CSV.

    NOTE: this notebook defines fetch_data in two consecutive cells; the
    later definition is the one in effect when the pipeline runs.

    Parameters
    ----------
    filename_key : key into ``dataDict`` (a hashtag for training data, an
        integer sample index for testing data).
    dataDict : dict mapping each key to ``[file_name, tweet_count]``.
        ``tweet_count`` caps how many tweets are read from the file.
    testingData : bool
        True  -> read ``./test_data/<file_name>``, take the tweet time from
                 ``firstpost_date`` and start the first hourly window at the
                 top of the hour containing the earliest tweet.
        False -> read ``tweet_data/<file_name>``, take the tweet time from
                 ``citation_date`` and start the first hourly window at the
                 fixed epoch 1421222400 (2015-01-14 08:00:00 UTC).

    Side effects
    ------------
    Creates ``./retrievedData`` if needed and writes one CSV row per hourly
    window to ``./retrievedData/Q5_<name>.csv`` where ``<name>`` is the file
    name without its 4-character extension (testing) or ``filename_key``
    (training). ``target_value`` is the next window's tweet count; the last
    row gets the placeholder 0 because no next window exists.

    Raises
    ------
    ValueError
        If the file contains no tweets (or ``tweet_count`` is 0).
    """
    fileName, maxTweets = dataDict[filename_key][0], dataDict[filename_key][1]
    if testingData:
        filePath = './test_data/' + fileName
        postingTimeFeature = 'firstpost_date'
    else:
        filePath = 'tweet_data/' + fileName
        postingTimeFeature = 'citation_date'

    # Collect only the tweets actually present in the file. Pre-sizing the
    # per-tweet arrays from the declared count would leave timestamp 0 in
    # unfilled slots whenever the file is shorter than declared, which
    # corrupts min(timeStamps) and the hourly bins.
    tweets = []
    with open(filePath) as input_file:  # closed even if a line fails to parse
        for line in input_file:
            if len(tweets) >= maxTweets:
                break
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            record = json.loads(line)
            authorName = record['author']['nick']
            tweets.append((
                record[postingTimeFeature],
                # A tweet posted by someone other than its original author
                # is counted as a retweet.
                authorName != record['original_author']['nick'],
                record['author']['followers'],
                len(record['tweet']['entities']['urls']),
                authorName,
                len(record['tweet']['entities']['user_mentions']),
                record['metrics']['ranking_score'],
                record['title'].count('#'),
            ))

    if not tweets:
        raise ValueError('no tweets found in ' + filePath)

    timeStamps = [tweet[0] for tweet in tweets]

    # '//' keeps the floor-to-the-hour behaviour on every Python version;
    # with true division '(x/3600)*3600' would be a no-op.
    startTime = 1421222400
    if testingData:
        startTime = (min(timeStamps) // 3600) * 3600

    hours_passed = int((max(timeStamps) - startTime) // 3600) + 1

    hourlyTweets = [0] * hours_passed
    hourlyRetweets = [0] * hours_passed
    hourlyFollowerSum = [0] * hours_passed
    hourlyMaxFollowers = [0] * hours_passed
    hourlyURLCitations = [0] * hours_passed
    hourlyAuthorSet = [set() for _ in range(hours_passed)]
    hourlyMentions = [0] * hours_passed
    hourlyRankingScores = [0.0] * hours_passed
    hourlyHashtags = [0] * hours_passed

    for (stamp, retweeted, followers, urls,
         author, mentions, score, hashtags) in tweets:
        # A tweet older than startTime gives a negative hour, which Python
        # would index from the end of the lists; the input is assumed not to
        # contain such tweets.
        current_hour = int((stamp - startTime) // 3600)

        hourlyTweets[current_hour] += 1
        if retweeted:
            hourlyRetweets[current_hour] += 1
        hourlyFollowerSum[current_hour] += followers
        if followers > hourlyMaxFollowers[current_hour]:
            hourlyMaxFollowers[current_hour] = followers
        hourlyURLCitations[current_hour] += urls
        hourlyAuthorSet[current_hour].add(author)
        hourlyMentions[current_hour] += mentions
        hourlyRankingScores[current_hour] += score
        hourlyHashtags[current_hour] += hashtags

    # Number of distinct authors per window.
    hourlyAuthors = [len(authors) for authors in hourlyAuthorSet]

    # Hour of day of each window, counted from the fixed epoch. Training
    # windows start at the epoch itself, so their offset is 0.
    firstHour = (startTime - 1421222400) // 3600 if testingData else 0
    hourlyTime = [(firstHour + i) % 24 for i in range(hours_passed)]

    # Regression target: the tweet count of the following window.
    target_value = hourlyTweets[1:]
    target_value.append(0)

    # Going through one numpy array (rather than per-column data) keeps every
    # column as float, so the CSV formatting stays the same as before.
    features = np.transpose(np.array([hourlyTweets,
                                      hourlyRetweets,
                                      hourlyFollowerSum,
                                      hourlyMaxFollowers,
                                      hourlyTime,
                                      hourlyURLCitations,
                                      hourlyAuthors,
                                      hourlyMentions,
                                      hourlyRankingScores,
                                      hourlyHashtags,
                                      target_value]))
    df = DataFrame(features)
    df.columns = ['num_tweets',
                  'num_retweets',
                  'sum_followers',
                  'max_followers',
                  'timeDay',
                  'num_URLs',
                  'num_authors',
                  'num_mensions',
                  'ranking_score',
                  'num_hashtags',
                  'target_value']

    if not os.path.isdir('./retrievedData'):
        os.mkdir('./retrievedData')

    if testingData:
        df.to_csv('./retrievedData/Q5_'+fileName[:-4]+'.csv', index = False)
    else:
        df.to_csv('./retrievedData/Q5_'+filename_key+'.csv', index = False)


def oneHotEncoder(df):
    """Add 24 one-hot indicator columns for the ``timeDay`` column of ``df``.

    For every hour h in 0..23 a column named ``'<h>th_hour'`` is inserted
    holding 1 where ``timeDay == h`` and 0 elsewhere. Each new column goes
    immediately before the current last column, so the original last column
    stays last. ``df`` is modified in place and also returned.
    """
    for hour in range(24):
        indicator = [1 if value == hour else 0 for value in df['timeDay']]
        # Position is recomputed on every pass because the frame grows.
        df.insert(len(df.columns) - 1, '%dth_hour' % hour, indicator)
    return df



def data_regression(training_hashtag, testing_data_index,
                    event_start_hour=440, event_end_hour=452):
    """Fit a random forest on one hashtag's hourly data and score a test sample.

    Parameters
    ----------
    training_hashtag : hashtag whose ``./retrievedData/Q5_<hashtag>.csv`` is
        used for training.
    testing_data_index : key into the module-level ``testingDict``; selects
        the test CSV and, through the character right before the file
        extension, the period ('1' = before the event, '2' = during,
        anything else = after).
    event_start_hour, event_end_hour : row positions in the training CSV that
        split it into before / during / after the event
        (``[:start]``, ``[start:end]``, ``[end:]``).

    Returns
    -------
    (results, mean_abs_error)
        ``results`` is a DataFrame with columns ``Predicted`` and ``Actual``
        for every test row. ``mean_abs_error`` averages |actual - predicted|
        over all rows except the last one, whose target is the placeholder 0
        written by fetch_data.

    Raises
    ------
    ValueError
        If the test sample has fewer than two rows, so that no row with a
        real target is left to score.
    """
    training_x = pd.read_csv('./retrievedData/Q5_'+training_hashtag+'.csv')
    testing_x = pd.read_csv('./retrievedData/Q5_'+testingDict[testing_data_index][0][:-4]+'.csv')

    # 'axis' must be passed by keyword: the positional form is deprecated and
    # was removed in pandas 2.0.
    training_x = oneHotEncoder(training_x).drop('timeDay', axis=1)
    training_y = training_x.pop('target_value')

    testing_x = oneHotEncoder(testing_x).drop('timeDay', axis=1)
    testing_y = testing_x.pop('target_value')

    # Pick the training rows that belong to the same period as the sample.
    period = testingDict[testing_data_index][0][-5]
    if period == '1':
        rows = slice(None, event_start_hour)
    elif period == '2':
        rows = slice(event_start_hour, event_end_hour)
    else:
        rows = slice(event_end_hour, None)

    # Only the regressor that is actually used gets trained; fitting the
    # other two periods was wasted work and could fail on an empty slice.
    regressor = RandomForestRegressor(n_estimators = 20, max_depth = 9)
    regressor.fit(training_x.iloc[rows], training_y.iloc[rows])
    predicted_y = regressor.predict(testing_x)

    actual_y = testing_y.values
    results = DataFrame(np.column_stack((predicted_y, actual_y)))
    results.columns = ['Predicted', 'Actual']

    scored_rows = len(actual_y) - 1
    if scored_rows < 1:
        raise ValueError('test sample needs at least two rows to compute an error')

    total_error = 0.0
    for actual, predicted in zip(actual_y[:scored_rows], predicted_y[:scored_rows]):
        total_error += abs(actual - predicted)

    return results, total_error/scored_rows






def predict(testing_data_index):
    """Run data_regression once per training hashtag and keep the best run.

    Returns a 3-tuple ``(results, error, hashtag)`` for the hashtag whose
    model gave the smallest error on the given test sample. On a tie the
    hashtag that comes first in the list below wins.
    """
    hashtags = ['#GoHawks',
                '#GoPatriots',
                '#NFL',
                '#Patriots',
                '#SB49',
                '#SuperBowl']

    # One (results, error) pair per hashtag, in list order.
    outcomes = [data_regression(tag, testing_data_index) for tag in hashtags]

    # min() keeps the earliest entry among equal errors.
    best = min(range(len(outcomes)), key=lambda position: outcomes[position][1])
    best_result, best_error = outcomes[best]

    return best_result,best_error,hashtags[best]



def fetch_results(testing_data_index):
    """Print the best prediction table found over 21 runs of predict().

    predict() is called once and then 20 more times; the random forests are
    not seeded, so each run can give a different error. The
    Predicted/Actual table of the run with the smallest error is printed
    between two lines of 20 dashes. Nothing is returned.

    NOTE(review): the winning run is chosen by its error on the test sample
    itself, so the printed table is an optimistic pick, not an unbiased
    estimate.
    """
    bestResult, minimum_error, _ = predict(testing_data_index)

    for _ in range(20):
        result, error, _ = predict(testing_data_index)
        if error < minimum_error:
            minimum_error = error
            bestResult = result

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the bare print statement.
    print('-'*20)
    print(bestResult)
    print('-'*20)



def part5():
    """Run the whole pipeline: build every feature CSV, then print results.

    1. fetch_data for each of the six training hashtags (from traingDict).
    2. fetch_data for test samples 1..10 (from testingDict).
    3. fetch_results for test samples 1..10.
    """
    training_hashtags = ('#GoHawks', '#GoPatriots', '#NFL',
                         '#Patriots', '#SB49', '#SuperBowl')
    for hashtag in training_hashtags:
        fetch_data(hashtag, traingDict, False)

    sample_indices = range(1, 11)
    for sample in sample_indices:
        fetch_data(sample, testingDict, True)

    for sample in sample_indices:
        fetch_results(sample)

In [13]:
# Build the hourly feature CSVs for all training and test files, then print
# the best Predicted/Actual table for each of the 10 test samples.
part5()

--------------------
    Predicted  Actual
0  119.122440    82.0
1   77.068261    68.0
2   93.081802    94.0
3   94.100586   171.0
4  158.796791   178.0
5  207.198387     0.0
--------------------
--------------------
   Predicted   Actual
0   20354.10   9361.0
1   19674.25  10374.0
2   25444.55  20066.0
3   83508.45  81958.0
4   89970.10  82923.0
5  116255.75      0.0
--------------------
--------------------
    Predicted  Actual
0  474.263333   550.0
1  508.108333   610.0
2  571.908333   888.0
3  650.575000   616.0
4  521.813333   523.0
5  468.463333     0.0
--------------------
--------------------
    Predicted  Actual
0  322.781043   257.0
1  230.964448   236.0
2  267.207538   266.0
3  243.025395   267.0
4  253.053030   201.0
5  197.633562     0.0
--------------------
--------------------
    Predicted  Actual
0  408.951316   508.0
1  426.351316   353.0
2  398.851316   362.0
3  408.951316   281.0
4  224.101316   213.0
5  103.101316     0.0
--------------------
--------------------