In [50]:
# Data preparation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *
plt.style.use('ggplot')
# Modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import r2_score

# 1.Load & Prepare Data

**Tweets:**
* We've scraped tweets of 16 states in U.S.
* We used "small business in" + "state_name" as querying key word.
* For each state, code requested 100 tweets on daily basis and "text","date","username" were attached into dataframe.
* Text data (tweets) is scraped and preprocessed into sentimental values (percentage of positive/neutral/negative words) in notebook, "tweets_to_sentiment".

**Target:**
* Target data is from main dataset of this project.

**Join:**
* Timestamp of both dataset are wrangled into combination of year_quarter and merged on the column.

In [5]:
def get_quarter_info_target(x):
    if x == "March":
        return 1
    elif x == "June":
        return 2
    elif x == "September":
        return 3
    else:
        return 4

**Load & Merge & Prepare Final Data**

In [36]:
final_data = pd.DataFrame()
state_name_list = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
                   'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 
                   'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'New_York']
for state in state_name_list:
    # Sentiment from tweets
    state_name = str(state).replace("_", " ")
    state_sentiment = pd.read_csv("sentiment_from_tweets/%s_sentiment.csv" % state, index_col=0)
    # Target: up&down in SME
    target = pd.read_csv("target.csv")
    target.year = target.year.apply(lambda x: int(x))
    target["quarter"] = target.month.apply(lambda x: get_quarter_info_target(x))
    target = target.sort_values(["year", "quarter"])
    target["timestamp"] = target.year.apply(lambda x: str(int(x)) + "_") \
                                        + target.quarter.apply(lambda x: str(int(x)))
    target = target[(target.year > 2011) & (target.year <= 2016) & (target.state == state_name)].reset_index(drop=True)
    target = target.drop_duplicates().reset_index(drop=True)
    # Merge
    temp_data = state_sentiment.copy().reset_index(drop=True)
    temp_data = temp_data.merge(target, on="timestamp")
    # Processing merged data into change rate
    temp_data["negative_shift"] = temp_data.negative.shift(1)
    temp_data["negative_change_rate"] = (temp_data.negative - temp_data.negative_shift) / temp_data.negative_shift
    temp_data["positive_shift"] = temp_data.positive.shift(1)
    temp_data["positive_change_rate"] = (temp_data.positive - temp_data.positive_shift) / temp_data.positive_shift
    temp_data["neutral_shift"] = temp_data.neutral.shift(1)
    temp_data["neutral_change_rate"] = (temp_data.neutral - temp_data.neutral_shift) / temp_data.neutral_shift
    temp_data["total_contract_shift"] = temp_data.total_contract.shift(1)
    temp_data["total_contract_change_rate"] = (temp_data.total_contract - temp_data.total_contract_shift) / temp_data.total_contract_shift
    temp_data["total_expand_shift"] = temp_data.total_expand.shift(1)
    temp_data["total_expand_change_rate"] = (temp_data.total_expand - temp_data.total_expand_shift) / temp_data.total_expand_shift
    temp_data["net_change_shift"] = temp_data.net_change.shift(1)
    temp_data["net_change_change_rate"] = (temp_data.net_change - temp_data.net_change_shift) / temp_data.net_change_shift
    temp_data = temp_data[["state", "timestamp", "negative_change_rate", "neutral_change_rate", "positive_change_rate", 
                           "total_contract_change_rate", "total_expand_change_rate", "net_change_change_rate"]].drop(0).reset_index(drop=True)
    # append processed data into final dataset
    final_data = final_data.append(temp_data, ignore_index=True)
final_data = final_data.reset_index(drop=True)

**Split data into X, y**

In [46]:
X = final_data[["negative_change_rate", "neutral_change_rate", "positive_change_rate"]]
y_contract = final_data["total_contract_change_rate"]
y_expand = final_data["total_expand_change_rate"]
y_net_change = final_data["net_change_change_rate"]

**Split data into train, test set**

In [49]:
X_train_contract, X_test_contract, y_train_contract, y_test_contract = train_test_split(X, y_contract, test_size=0.2, random_state=42)
X_train_expand, X_test_expand, y_train_expand, y_test_expand = train_test_split(X, y_expand, test_size=0.2, random_state=42)
X_train_net_change, X_test_net_change, y_train_net_change, y_test_net_change = train_test_split(X, y_net_change, test_size=0.2, random_state=42)

# 2.Forecasting Model

## 2-1.Contract

In [51]:
cv_score_sheet_contract = pd.DataFrame(columns=["Model", "CV score"])
model_dict_contract = {}

**Linear Models**

In [53]:
# Linear Regression
linear_pipe = Pipeline(memory=None, 
                       steps=[('standardscaler', StandardScaler()), ('linearregression', LinearRegression())])
linear_param_grid = {}
linear_grid_contract = GridSearchCV(linear_pipe, param_grid=linear_param_grid, cv=10, refit=True, iid=True, scoring='r2')
linear_grid_contract.fit(X_train_contract, y_train_contract);
cv_score_sheet_contract = cv_score_sheet_contract.append({"Model": "Linear Regression", "CV score": round(linear_grid_contract.best_score_,2)}, ignore_index=True)
model_dict_contract["Linear Regression"] = linear_grid_contract

In [54]:
# Ridge
ridge_pipe = Pipeline(memory=None, 
                      steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())])
ridge_param_grid = {"ridge__alpha": np.logspace(0, 3, 30)}
ridge_grid_contract = GridSearchCV(ridge_pipe, param_grid=ridge_param_grid, cv=10, refit=True, iid=True, scoring='r2')
ridge_grid_contract.fit(X_train_contract, y_train_contract);
cv_score_sheet_contract = cv_score_sheet_contract.append({"Model": "Ridge Regression", "CV score": round(ridge_grid_contract.best_score_,2)}, ignore_index=True)
model_dict_contract["Ridge Regression"] = ridge_grid_contract

In [55]:
# Lasso
lasso_pipe = Pipeline(memory=None, 
                      steps=[('standardscaler', StandardScaler()), ('lasso', Lasso())])
lasso_param_grid = {"lasso__alpha": np.logspace(0, 3, 30)}
lasso_grid_contract = GridSearchCV(lasso_pipe, param_grid=lasso_param_grid, cv=10, refit=True, iid=True, scoring='r2')
lasso_grid_contract.fit(X_train_contract, y_train_contract);
cv_score_sheet_contract = cv_score_sheet_contract.append({"Model": "Lasso Regression", "CV score": round(lasso_grid_contract.best_score_,2)}, ignore_index=True)
model_dict_contract["Lasso Regression"] = lasso_grid_contract

In [56]:
# ElasticNet
elasticnet_pipe = Pipeline(memory=None, 
                        steps=[('standardscaler', StandardScaler()), ('elasticnet', ElasticNet())])
elasticnet_param_grid = {"elasticnet__alpha": np.logspace(0, 3, 30), 
                         "elasticnet__l1_ratio": [.1, .2, .3, .4, .5, .6, .7, .8, .9]}
elasticnet_grid_contract = GridSearchCV(elasticnet_pipe, param_grid=elasticnet_param_grid, cv=10, refit=True, iid=True, scoring='r2')
elasticnet_grid_contract.fit(X_train_contract, y_train_contract);
cv_score_sheet_contract = cv_score_sheet_contract.append({"Model": "ElasticNet", "CV score": round(elasticnet_grid_contract.best_score_,2)}, ignore_index=True)
model_dict_contract["ElasticNet"] = elasticnet_grid_contract

**Tree-based Models**

In [57]:
# RandomForest
rf_pipe = Pipeline(memory=None, 
                        steps=[('rf', RandomForestRegressor(n_estimators=100))])
rf_param_grid = {"rf__max_depth": [3, 5, 7, 8]}
rf_grid_contract = GridSearchCV(rf_pipe, param_grid=rf_param_grid, cv=10, refit=True, iid=True, scoring='r2')
rf_grid_contract.fit(X_train_contract, y_train_contract);
cv_score_sheet_contract = cv_score_sheet_contract.append({"Model": "Random Forest Regressor", "CV score": round(rf_grid_contract.best_score_,2)}, ignore_index=True)
model_dict_contract["Random Forest Regressor"] = rf_grid_contract

In [58]:
# AdaBoost
ab_pipe = Pipeline(memory=None, 
                        steps=[('ab', AdaBoostRegressor(n_estimators=100))])
ab_param_grid = {"ab__learning_rate": [.05, .1, .15, .2, .25]}
ab_grid_contract = GridSearchCV(ab_pipe, param_grid=ab_param_grid, cv=10, refit=True, iid=True, scoring='r2')
ab_grid_contract.fit(X_train_contract, y_train_contract);
cv_score_sheet_contract = cv_score_sheet_contract.append({"Model": "Adaboost", "CV score": round(ab_grid_contract.best_score_,2)}, ignore_index=True)
model_dict_contract["Adaboost"] = ab_grid_contract

In [59]:
# Gradient Boost
gb_pipe = Pipeline(memory=None, 
                        steps=[('gb', GradientBoostingRegressor(n_estimators=100, subsample=0.75))])
gb_param_grid = {"gb__learning_rate": [.05, .1, .15, .2, .25], "gb__max_depth": [3, 5, 7, 8]}
gb_grid_contract = GridSearchCV(gb_pipe, param_grid=gb_param_grid, cv=10, refit=True, iid=True, scoring='r2')
gb_grid_contract.fit(X_train_contract, y_train_contract);
cv_score_sheet_contract = cv_score_sheet_contract.append({"Model": "Gradient Boosting Regressor", "CV score": round(gb_grid_contract.best_score_,2)}, ignore_index=True)
model_dict_contract["Gradient Boosting Regressor"] = gb_grid_contract

In [60]:
# XGBoost
xgb_pipe = Pipeline(memory=None, 
                        steps=[('xgb', xgb.XGBRegressor(n_estimators=100, gamma=0, subsample=0.75, colsample_bytree=1))])
xgb_param_grid = {"xgb__learning_rate": [.05, .08, .1, .15], "xgb__max_depth": [3, 5, 7, 8]}
xgb_grid_contract = GridSearchCV(xgb_pipe, param_grid=xgb_param_grid, cv=10, refit=True, iid=True, scoring='r2')
xgb_grid_contract.fit(X_train_contract, y_train_contract);
cv_score_sheet_contract = cv_score_sheet_contract.append({"Model": "XGBoost Regressor", "CV score": round(xgb_grid_contract.best_score_,2)}, ignore_index=True)
model_dict_contract["XGBoost Regressor"] = xgb_grid_contract

**Rank by CV score**

In [61]:
cv_score_sheet_contract

Unnamed: 0,Model,CV score
0,Linear Regression,-0.03
1,Linear Regression,-0.03
2,Ridge Regression,-0.02
3,Lasso Regression,-0.04
4,ElasticNet,-0.04
5,Random Forest Regressor,-0.11
6,Adaboost,-0.08
7,Gradient Boosting Regressor,-0.3
8,XGBoost Regressor,-0.28


## 2-2.Expand

In [62]:
cv_score_sheet_expand = pd.DataFrame(columns=["Model", "CV score"])
model_dict_expand = {}

**Linear Models**

In [63]:
# Linear Regression
linear_grid_expand = GridSearchCV(linear_pipe, param_grid=linear_param_grid, cv=10, refit=True, iid=True, scoring='r2')
linear_grid_expand.fit(X_train_expand, y_train_expand);
cv_score_sheet_expand = cv_score_sheet_expand.append({"Model": "Linear Regression", "CV score": round(linear_grid_expand.best_score_,2)}, ignore_index=True)
model_dict_expand["Linear Regression"] = linear_grid_expand

In [64]:
# Ridge
ridge_grid_expand = GridSearchCV(ridge_pipe, param_grid=ridge_param_grid, cv=10, refit=True, iid=True, scoring='r2')
ridge_grid_expand.fit(X_train_expand, y_train_expand);
cv_score_sheet_expand = cv_score_sheet_expand.append({"Model": "Ridge Regression", "CV score": round(ridge_grid_expand.best_score_,2)}, ignore_index=True)
model_dict_expand["Ridge Regression"] = ridge_grid_expand

In [65]:
# Lasso
lasso_grid_expand = GridSearchCV(lasso_pipe, param_grid=lasso_param_grid, cv=10, refit=True, iid=True, scoring='r2')
lasso_grid_expand.fit(X_train_expand, y_train_expand);
cv_score_sheet_expand = cv_score_sheet_expand.append({"Model": "Lasso Regression", "CV score": round(lasso_grid_expand.best_score_,2)}, ignore_index=True)
model_dict_expand["Lasso Regression"] = lasso_grid_expand

In [66]:
# ElasticNet
elasticnet_grid_expand = GridSearchCV(elasticnet_pipe, param_grid=elasticnet_param_grid, cv=10, refit=True, iid=True, scoring='r2')
elasticnet_grid_expand.fit(X_train_expand, y_train_expand);
cv_score_sheet_expand = cv_score_sheet_expand.append({"Model": "ElasticNet", "CV score": round(elasticnet_grid_expand.best_score_,2)}, ignore_index=True)
model_dict_expand["ElasticNet"] = elasticnet_grid_expand

**Tree-based Models**

In [67]:
# RandomForest
rf_grid_expand = GridSearchCV(rf_pipe, param_grid=rf_param_grid, cv=10, refit=True, iid=True, scoring='r2')
rf_grid_expand.fit(X_train_expand, y_train_expand);
cv_score_sheet_expand = cv_score_sheet_expand.append({"Model": "Random Forest Regressor", "CV score": round(rf_grid_expand.best_score_,2)}, ignore_index=True)
model_dict_expand["Random Forest Regressor"] = rf_grid_expand

In [68]:
# AdaBoost
ab_grid_expand = GridSearchCV(ab_pipe, param_grid=ab_param_grid, cv=10, refit=True, iid=True, scoring='r2')
ab_grid_expand.fit(X_train_expand, y_train_expand);
cv_score_sheet_expand = cv_score_sheet_expand.append({"Model": "Adaboost", "CV score": round(ab_grid_expand.best_score_,2)}, ignore_index=True)
model_dict_expand["Adaboost"] = ab_grid_expand

In [69]:
# Gradient Boost
gb_grid_expand = GridSearchCV(gb_pipe, param_grid=gb_param_grid, cv=10, refit=True, iid=True, scoring='r2')
gb_grid_expand.fit(X_train_expand, y_train_expand);
cv_score_sheet_expand = cv_score_sheet_expand.append({"Model": "Gradient Boosting Regressor", "CV score": round(gb_grid_expand.best_score_,2)}, ignore_index=True)
model_dict_expand["Gradient Boosting Regressor"] = gb_grid_expand

In [70]:
# XGBoost
xgb_grid_expand = GridSearchCV(xgb_pipe, param_grid=xgb_param_grid, cv=10, refit=True, iid=True, scoring='r2')
xgb_grid_expand.fit(X_train_expand, y_train_expand);
cv_score_sheet_expand = cv_score_sheet_expand.append({"Model": "XGBoost Regressor", "CV score": round(xgb_grid_expand.best_score_,2)}, ignore_index=True)
model_dict_expand["XGBoost Regressor"] = xgb_grid_expand

**Rank by CV socre**

In [71]:
cv_score_sheet_expand

Unnamed: 0,Model,CV score
0,Linear Regression,-0.14
1,Ridge Regression,-0.11
2,Lasso Regression,-0.11
3,ElasticNet,-0.11
4,Random Forest Regressor,-0.18
5,Adaboost,-0.15
6,Gradient Boosting Regressor,-0.3
7,XGBoost Regressor,-0.26


## 2-3. Net Change

In [73]:
cv_score_sheet_net_change = pd.DataFrame(columns=["Model", "CV score"])
model_dict_net_change = {}

**Linear Models**

In [74]:
# Linear Regression
linear_grid_net_change = GridSearchCV(linear_pipe, param_grid=linear_param_grid, cv=10, refit=True, iid=True, scoring='r2')
linear_grid_net_change.fit(X_train_net_change, y_train_net_change);
cv_score_sheet_net_change = cv_score_sheet_net_change.append({"Model": "Linear Regression", "CV score": round(linear_grid_net_change.best_score_,2)}, ignore_index=True)
model_dict_net_change["Linear Regression"] = linear_grid_net_change

In [75]:
# Ridge
ridge_grid_net_change = GridSearchCV(ridge_pipe, param_grid=ridge_param_grid, cv=10, refit=True, iid=True, scoring='r2')
ridge_grid_net_change.fit(X_train_net_change, y_train_net_change);
cv_score_sheet_net_change = cv_score_sheet_net_change.append({"Model": "Ridge Regression", "CV score": round(ridge_grid_net_change.best_score_,2)}, ignore_index=True)
model_dict_net_change["Ridge Regression"] = ridge_grid_net_change

In [76]:
# Lasso
lasso_grid_net_change = GridSearchCV(lasso_pipe, param_grid=lasso_param_grid, cv=10, refit=True, iid=True, scoring='r2')
lasso_grid_net_change.fit(X_train_net_change, y_train_net_change);
cv_score_sheet_net_change = cv_score_sheet_net_change.append({"Model": "Lasso Regression", "CV score": round(lasso_grid_net_change.best_score_,2)}, ignore_index=True)
model_dict_net_change["Lasso Regression"] = lasso_grid_net_change

In [77]:
# ElasticNet
elasticnet_grid_net_change = GridSearchCV(elasticnet_pipe, param_grid=elasticnet_param_grid, cv=10, refit=True, iid=True, scoring='r2')
elasticnet_grid_net_change.fit(X_train_net_change, y_train_net_change);
cv_score_sheet_net_change = cv_score_sheet_net_change.append({"Model": "ElasticNet", "CV score": round(elasticnet_grid_net_change.best_score_,2)}, ignore_index=True)
model_dict_net_change["ElasticNet"] = elasticnet_grid_net_change

**Tree-based Models**

In [78]:
# RandomForest
rf_grid_net_change = GridSearchCV(rf_pipe, param_grid=rf_param_grid, cv=10, refit=True, iid=True, scoring='r2')
rf_grid_net_change.fit(X_train_net_change, y_train_net_change);
cv_score_sheet_net_change = cv_score_sheet_net_change.append({"Model": "Random Forest Regressor", "CV score": round(rf_grid_net_change.best_score_,2)}, ignore_index=True)
model_dict_net_change["Random Forest Regressor"] = rf_grid_net_change

In [79]:
# AdaBoost
ab_grid_net_change = GridSearchCV(ab_pipe, param_grid=ab_param_grid, cv=10, refit=True, iid=True, scoring='r2')
ab_grid_net_change.fit(X_train_net_change, y_train_net_change);
cv_score_sheet_net_change = cv_score_sheet_net_change.append({"Model": "Adaboost", "CV score": round(ab_grid_net_change.best_score_,2)}, ignore_index=True)
model_dict_net_change["Adaboost"] = ab_grid_net_change

In [80]:
# Gradient Boost
gb_grid_net_change = GridSearchCV(gb_pipe, param_grid=gb_param_grid, cv=10, refit=True, iid=True, scoring='r2')
gb_grid_net_change.fit(X_train_net_change, y_train_net_change);
cv_score_sheet_net_change = cv_score_sheet_net_change.append({"Model": "Gradient Boosting Regressor", "CV score": round(gb_grid_net_change.best_score_,2)}, ignore_index=True)
model_dict_net_change["Gradient Boosting Regressor"] = gb_grid_net_change

In [81]:
# XGBoost
xgb_grid_net_change = GridSearchCV(xgb_pipe, param_grid=xgb_param_grid, cv=10, refit=True, iid=True, scoring='r2')
xgb_grid_net_change.fit(X_train_net_change, y_train_net_change);
cv_score_sheet_net_change = cv_score_sheet_net_change.append({"Model": "XGBoost Regressor", "CV score": round(xgb_grid_net_change.best_score_,2)}, ignore_index=True)
model_dict_net_change["XGBoost Regressor"] = xgb_grid_net_change

**Rank by CV Score**

In [87]:
cv_score_sheet_net_change

Unnamed: 0,Model,CV score
0,Linear Regression,-1.51
1,Ridge Regression,-1.5
2,Lasso Regression,-1.51
3,ElasticNet,-1.5
4,Random Forest Regressor,-2.22
5,Adaboost,-1.18
6,Gradient Boosting Regressor,-3.32
7,XGBoost Regressor,-5.1


# 3. Wrap-Up
* After training model forecasting target values (change rate of total contract, change rate of total expand, change rate of net change), trained models shows the negative cross-validation scores (r2 score)
* It suggests that it hard to capture trend in target values only with sentimental variables as feature.
* However, at the same time, there might comes from characteristic of states or quality of data (better key words exists).
* For the next step, we will analyze relationship between raw sentiment variable & target values.