In [136]:
from pandas import Series
import json
import pandas as pd
import datetime
import re
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import os
from collections import defaultdict
import statistics


def PolMedian(df):
    """Return the median tweet polarity for each date found in ``df``.

    Every row is read through its ``date`` and ``pol`` fields. The result
    holds one value per distinct date, ordered by the date's first appearance
    in ``df``. It carries a default positional index; the dates themselves
    are not kept.
    """
    scores_by_date = {}
    for _, record in df.iterrows():
        scores_by_date.setdefault(record.date, []).append(record.pol)

    medians = [statistics.median(scores) for scores in scores_by_date.values()]
    return Series(medians)

def PolSum(df):
    """Return the summed tweet polarity for each date found in ``df``.

    Every row is read through its ``date`` and ``pol`` fields. The result
    holds one total per distinct date, ordered by the date's first appearance
    in ``df``, with a default positional index (dates are not kept).
    """
    totals = {}
    for _, record in df.iterrows():
        day = record.date
        # Start each date at 0 and accumulate in row order.
        totals[day] = totals.get(day, 0) + record.pol

    return Series(list(totals.values()))


def createModels(X,y):
    """Fit and report linear, lasso and ridge regressions of ``y`` on ``X``.

    The data is split into roughly two thirds training and one third test
    after seeding NumPy's global random generator with 1. Each model is fitted
    on the training part and scored on the test part. For every model the
    root mean squared error and R-squared are printed; the two
    cross-validated models also print the alpha they selected.

    Parameters
    ----------
    X : feature table accepted by scikit-learn estimators.
    y : target values with one entry per row of ``X``.

    Returns
    -------
    None. Results are written to standard output only.
    """
    np.random.seed(1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.333333)

    # Candidate regularisation strengths shared by LassoCV and RidgeCV.
    alphas = (0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5, 1.0)

    def report(title, model, show_alpha):
        """Fit ``model`` on the training split and print its test metrics."""
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print(title)
        print("Root mean squared error = %.4f" % np.sqrt(mean_squared_error(y_test, y_pred)))
        print('R-squared = %.4f' % r2_score(y_test, y_pred))
        if show_alpha:
            # Report the alpha chosen by THIS model. The Ridge section used to
            # print the Lasso model's alpha by mistake.
            print('Selected alpha = %.2f' % model.alpha_)

    #REGRESSION MODEL
    report("Linear Regression:", linear_model.LinearRegression(), False)
    print("\n")

    #LASSO
    report("Lasso Model:", linear_model.LassoCV(cv=2, alphas=alphas), True)
    print("\n")

    #RIDGE
    report("Ridge: ", linear_model.RidgeCV(cv=2, alphas=alphas), True)

    
def getDataSet(Z):
    """Build the lag-1 feature table from a sequence of values.

    Row ``i`` of the returned frame holds ``Z``'s ``i``-th value (by
    position), for every value except the last one. The frame therefore has
    exactly as many rows as ``getTarget(Z)`` has entries, whatever the length
    of ``Z``. Values are taken positionally, so a Series labelled with dates
    (or anything else) is handled without label lookups.

    Parameters
    ----------
    Z : one-dimensional sequence (pandas Series, list, array).

    Returns
    -------
    DataFrame with a single column ``"lag1"`` and a default 0..n-2 index.
    """
    # np.asarray drops any labels, so the slice below is purely positional.
    values = np.asarray(Z)[:-1]
    X = pd.DataFrame({"lag1": values})
    return X
def getTarget(Z):
    """Return ``Z`` without its first element (everything from position 1 on)."""
    from_second = slice(1, None)
    return Z[from_second]

def getPol(t):
    """Attach a TextBlob polarity score to every tweet in ``t``.

    A ``pol`` column is written onto ``t`` itself (the frame is modified in
    place), holding ``TextBlob(text).sentiment.polarity`` for each entry of
    ``t.text``. The same frame is then returned.
    """
    scores = [TextBlob(body).sentiment.polarity for body in t.text]
    # Re-use t's own index so the scores land on the rows they came from.
    t['pol'] = Series(scores, index=t.index)
    return t

def getChange(ticker):
    """Return close-minus-open for every row of ``ticker``.

    Reads the ``Date``, ``Close`` and ``Open`` columns and returns a Series
    of ``Close - Open`` (computed element by element on the raw values),
    indexed by the ``Date`` values.
    """
    dates = ticker['Date'].values
    delta = ticker['Close'].values - ticker['Open'].values
    return Series(delta, index=dates)

def GetZ(ticker):
    """Return the standardized daily change for ``ticker``.

    The change series produced by ``getChange`` is centered on its mean and
    divided by its standard deviation (as computed by ``Series.std``).
    """
    change = getChange(ticker)
    centered = change - change.mean()
    return centered / change.std()

def get_tweets(f):
    """Read a file of blank-line-separated JSON tweet records.

    Parameters
    ----------
    f : path of the file to read.

    Returns
    -------
    list of dict
        One ``{"date": "YYYY-MM-DD", "text": ...}`` entry per record that has
        a ``created_at`` field. Records without it are skipped.

    Raises
    ------
    json.JSONDecodeError
        If a record is not valid JSON.
    ValueError
        If ``created_at`` is not of the form
        ``'%a %b %d %H:%M:%S +0000 %Y'``.
    """
    # Read as UTF-8 explicitly: tweet text holds emoji and other non-ASCII
    # characters, and the platform default encoding may fail on them or
    # garble them.
    with open(f, encoding="utf-8") as file:
        data = file.read()

    # Records are separated by a blank line. The chunk after the final
    # separator is discarded, exactly as before.
    # NOTE(review): this assumes the file ends with a separator; otherwise the
    # last record is dropped. Confirm against the data files.
    records = data.split("\n\n")[:-1]

    tweets = []
    for record in records:
        tw = json.loads(record)
        if "created_at" in tw:
            day = datetime.datetime.strptime(
                tw["created_at"], '%a %b %d %H:%M:%S +0000 %Y'
            ).strftime('%Y-%m-%d')
            tweets.append({"date": day, "text": tw["text"]})

    return tweets

def filt(df, keywords):
    """Return the rows of ``df`` whose ``text`` matches every keyword.

    Parameters
    ----------
    df : DataFrame with a ``text`` column of strings.
    keywords : iterable of str
        Each entry is used as a regular-expression pattern (it is not
        escaped) and searched case-insensitively. A row is kept only if all
        patterns are found in its text. An empty iterable keeps every row.

    Returns
    -------
    DataFrame
        A new frame with the matching rows and their original index labels.
        The columns of ``df`` are preserved even when nothing matches, so
        later steps that read ``text`` still work on an empty result.
    """
    if len(df) == 0:
        return df.copy()

    # Compile each pattern once instead of once per row.
    patterns = [re.compile("(" + key + ")", flags=re.IGNORECASE) for key in keywords]

    def determine(text):
        """Tell whether every pattern occurs somewhere in ``text``."""
        return all(pattern.search(text) for pattern in patterns)

    mask = [determine(text) for text in df["text"]]
    # .copy() makes the result an independent frame, so adding columns to it
    # later does not trigger pandas' chained-assignment warning.
    return df.loc[mask].copy()

def PolAvg(df):
    """Return the mean tweet polarity for each date found in ``df``.

    Every row is read through its ``date`` and ``pol`` fields. The result
    holds one average per distinct date, ordered by the date's first
    appearance in ``df``, with a default positional index (dates are not
    kept).
    """
    scores_by_date = {}
    for _, record in df.iterrows():
        scores_by_date.setdefault(record.date, []).append(record.pol)

    averages = [sum(scores) / len(scores) for scores in scores_by_date.values()]
    return Series(averages)

In [132]:
#get tweets
directory = os.fsencode('data')

tweets = []
for file in os.listdir(directory):
    # os.listdir yields bare file names. Join each one back onto the
    # directory so it is opened inside 'data' and not relative to the
    # current working directory.
    filename = os.fsdecode(os.path.join(directory, file))
    tw = pd.DataFrame(get_tweets(filename))
    tweets.append(tw)
# Each per-file frame keeps its own 0..n-1 index, so labels repeat across files.
tweetsDF = pd.concat(tweets)
tweetsDF.head()

Unnamed: 0,date,text
0,2015-04-27,@BrightonHouse_ *****NEW**** CLOSE 2 ME REMI...
1,2015-04-27,@Jazzy_Got_it facts
2,2015-04-27,Happy 4th birthday to my sweet Riley girl!! 🎉 ...
3,2015-04-27,Woke up to some very incredible news this morn...
4,2015-04-27,Wish life had a reset button 🔘


In [133]:
# Keep only the tweets that mention each company; filt matches case-insensitively.
NetflixTweets = filt(tweetsDF,["Netflix"])
MCDTweets = filt(tweetsDF,["Mcdonalds"])
FacebookTweets = filt(tweetsDF,["facebook"])
FacebookTweets.head()

Unnamed: 0,date,text
824,2015-04-27,Check out our website for more great prices at...
2415,2015-04-27,I don't get it @nikkiturnipseed tbh recruitmen...
2421,2015-04-27,@aditya_0911 @facebook Down for me too
2608,2015-04-27,@JustinPulitzer @johnnyt74 @facebook @jimcrame...
2823,2015-04-27,I should really head over to Facebook and see ...


In [137]:
# Score every filtered tweet. getPol writes a 'pol' column onto the frame it
# is given and returns that same frame, so NetflixTweets/FacebookTweets gain
# the column too.
NetflixPols = getPol(NetflixTweets)
FBPols = getPol(FacebookTweets)
FBPols.head()

Unnamed: 0,date,text,pol
824,2015-04-27,Check out our website for more great prices at...,0.65
2415,2015-04-27,I don't get it @nikkiturnipseed tbh recruitmen...,0.05
2421,2015-04-27,@aditya_0911 @facebook Down for me too,-0.155556
2608,2015-04-27,@JustinPulitzer @johnnyt74 @facebook @jimcrame...,0.5
2823,2015-04-27,I should really head over to Facebook and see ...,-0.257143


In [138]:
# Load the per-ticker price tables; getChange later reads their 'Date',
# 'Open' and 'Close' columns.
FB = pd.read_csv('FB.csv', header='infer' ) 
NFLX = pd.read_csv('NFLX.csv', header='infer')
MCD = pd.read_csv('MCD.csv', header='infer')

#get standardized price changes
# GetZ returns (Close - Open), centered on its mean and divided by its
# standard deviation, indexed by 'Date'.
FBZ=GetZ(FB)
NFLXZ=GetZ(NFLX)
MCDZ=GetZ(MCD)
MCDZ.head()

2015-04-27   -1.697213
2015-04-28    0.475588
2015-04-29    0.384414
2015-04-30   -0.025830
2015-05-01    0.863042
dtype: float64

# MCDONALDS

In [139]:
# Baseline: the only feature is the standardized change itself ("lag1");
# the target is the same series shifted by one position.
X = getDataSet(MCDZ)
y = getTarget(MCDZ)
createModels(X, y)

Linear Regression:
Root mean squared error = 0.4364
R-squared = 0.0360


Lasso Model:
Root mean squared error = 0.4446
R-squared = -0.0007
Selected alpha = 1.00


Ridge: 
Root mean squared error = 0.4364
R-squared = 0.0358
Selected alpha = 1.00


In [140]:
#add sum of polarity
# NOTE(review): PolSum returns a 0..k-1 indexed Series in first-seen date
# order, so this column lines up with X by position, not by date. Confirm the
# tweet dates correspond to the price rows.
MCDPols = getPol(MCDTweets)
X["sentiment sum"] = PolSum(MCDPols)
createModels(X, y)

Linear Regression:
Root mean squared error = 0.4364
R-squared = 0.0359


Lasso Model:
Root mean squared error = 0.4446
R-squared = -0.0007
Selected alpha = 1.00


Ridge: 
Root mean squared error = 0.4364
R-squared = 0.0357
Selected alpha = 1.00


In [141]:
# Add the per-date mean polarity as a further feature. X is extended in
# place, so it still carries the columns added by earlier cells.
X["sentiment average"] =  PolAvg(MCDPols)
createModels(X, y)
X.head()

Linear Regression:
Root mean squared error = 0.4363
R-squared = 0.0362


Lasso Model:
Root mean squared error = 0.4446
R-squared = -0.0007
Selected alpha = 1.00


Ridge: 
Root mean squared error = 0.4363
R-squared = 0.0361
Selected alpha = 1.00


Unnamed: 0,lag1,sentiment sum,sentiment average
0,-1.697213,-0.325,-0.325
1,0.475588,-0.331818,-0.165909
2,0.384414,0.225,0.028125
3,-0.02583,-0.15,-0.075


In [142]:
# Add the per-date median polarity as a further feature (X keeps the columns
# added by earlier cells).
X["sentiment median"] =  PolMedian(MCDPols)
createModels(X, y)

Linear Regression:
Root mean squared error = 0.4363
R-squared = 0.0364


Lasso Model:
Root mean squared error = 0.4446
R-squared = -0.0007
Selected alpha = 1.00


Ridge: 
Root mean squared error = 0.4363
R-squared = 0.0363
Selected alpha = 1.00


# FACEBOOK

In [143]:
# Baseline for Facebook: X and y are rebuilt from scratch, so the features
# added in the McDonald's section are discarded.
X = getDataSet(FBZ)
y = getTarget(FBZ)
createModels(X, y)

Linear Regression:
Root mean squared error = 0.9107
R-squared = -0.8784


Lasso Model:
Root mean squared error = 0.8543
R-squared = -0.6530
Selected alpha = 1.00


Ridge: 
Root mean squared error = 0.8997
R-squared = -0.8335
Selected alpha = 1.00


In [144]:
#add sum of polarity
# getPol recomputes the 'pol' column on FacebookTweets (it was already scored
# in an earlier cell).
# NOTE(review): FBSum has a 0..k-1 index in first-seen date order, so it lines
# up with X by position, not by date. Confirm the dates correspond.
FBPols = getPol(FacebookTweets)
FBSum = PolSum(FBPols)
X["sentiment sum"] = FBSum
createModels(X, y)

Linear Regression:
Root mean squared error = 0.8702
R-squared = -0.7150


Lasso Model:
Root mean squared error = 0.8543
R-squared = -0.6530
Selected alpha = 1.00


Ridge: 
Root mean squared error = 0.8600
R-squared = -0.6751
Selected alpha = 1.00


In [145]:
# Add the per-date mean polarity as a further feature (X keeps the columns
# added by earlier cells).
X["sentiment average"] = PolAvg(FBPols)
createModels(X, y)

Linear Regression:
Root mean squared error = 0.8752
R-squared = -0.7350


Lasso Model:
Root mean squared error = 0.8543
R-squared = -0.6530
Selected alpha = 1.00


Ridge: 
Root mean squared error = 0.8650
R-squared = -0.6948
Selected alpha = 1.00


In [146]:
# Add the per-date median polarity as a further feature (X keeps the columns
# added by earlier cells).
X["sentiment median"] = PolMedian(FBPols)
createModels(X, y)

Linear Regression:
Root mean squared error = 0.8433
R-squared = -0.6109


Lasso Model:
Root mean squared error = 0.8543
R-squared = -0.6530
Selected alpha = 1.00


Ridge: 
Root mean squared error = 0.8334
R-squared = -0.5731
Selected alpha = 1.00


# NETFLIX

In [147]:
# Baseline for Netflix: X and y are rebuilt from scratch, so the features
# added in the Facebook section are discarded.
X = getDataSet(NFLXZ)
y = getTarget(NFLXZ)
createModels(X, y)

Linear Regression:
Root mean squared error = 1.7101
R-squared = -14.0726


Lasso Model:
Root mean squared error = 1.2386
R-squared = -6.9070
Selected alpha = 1.00


Ridge: 
Root mean squared error = 1.7038
R-squared = -13.9612
Selected alpha = 1.00


In [148]:
#add sum of polarity
# getPol recomputes the 'pol' column on NetflixTweets (it was already scored
# in an earlier cell).
# NOTE(review): NFLXSum has a 0..k-1 index in first-seen date order, so it
# lines up with X by position, not by date. Confirm the dates correspond.
NetflixPols = getPol(NetflixTweets)
NFLXSum = PolSum(NetflixPols)

X["sentiment sum"] = NFLXSum
createModels(X, y)

Linear Regression:
Root mean squared error = 1.7824
R-squared = -15.3730


Lasso Model:
Root mean squared error = 1.2386
R-squared = -6.9070
Selected alpha = 1.00


Ridge: 
Root mean squared error = 1.7757
R-squared = -15.2503
Selected alpha = 1.00


In [149]:
# Add the per-date mean polarity as a further feature (X keeps the columns
# added by earlier cells).
X["sentiment average"] = PolAvg(NetflixPols)
createModels(X, y)

Linear Regression:
Root mean squared error = 1.7820
R-squared = -15.3662


Lasso Model:
Root mean squared error = 1.2386
R-squared = -6.9070
Selected alpha = 1.00


Ridge: 
Root mean squared error = 1.7753
R-squared = -15.2437
Selected alpha = 1.00


In [150]:
# Add the per-date median polarity as a further feature (X keeps the columns
# added by earlier cells).
X["sentiment median"] = PolMedian(NetflixPols)
createModels(X, y)

Linear Regression:
Root mean squared error = 1.7816
R-squared = -15.3576


Lasso Model:
Root mean squared error = 1.2386
R-squared = -6.9070
Selected alpha = 1.00


Ridge: 
Root mean squared error = 1.7749
R-squared = -15.2356
Selected alpha = 1.00
