In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf 
import warnings
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

warnings.filterwarnings('ignore')
%matplotlib inline
sns.set();

### Import Data

In [None]:
def import_data(filename):
    """Read the CSV file at ``filename`` and hand its contents back as a DataFrame."""
    return pd.read_csv(filename)

filename = 'KickStarter_Luther.csv'
df = import_data(filename)
df.sample(2, random_state = 42)

### Partition Data Based on Different Project Status

In [None]:
# A KickStarter project is either a success, live, or a fail.
# Take explicit copies: later cells add columns to these frames, and assigning
# into a frame derived from a filter of `df` raises SettingWithCopyWarning
# (hidden here by warnings.filterwarnings). A copy makes ownership unambiguous.
df_success = df.loc[df.Status == 'success'].copy()
df_fail = df.loc[df.Status == 'fail'].copy()
df_live = df.loc[df.Status == 'live'].copy()

### Explore Dataset

In [None]:
df_success.describe()
len(set(df_success.Location))

In [None]:
sns.set_context('paper')
sns.pairplot(df_success)

In [None]:
sns.set_context('poster')
# Restrict to numeric columns first: in pandas >= 2.0, DataFrame.corr() raises on
# string columns (e.g. Location, Category) instead of silently dropping them.
sns.heatmap(df_success.select_dtypes(include='number').corr(), square=True, cmap='RdYlGn')

There appears to be a slight positive correlation between:

- Number_of_Backer vs. Total_Pledged,
- Goal vs. Total_Pledged,
- and potentially some relationship between Total_Pledged vs. Goal.

In [None]:
#there are projects that has 253 different pledge options? There are.
df_success.loc[df_success.Number_of_Pledge_Options >= 200];

In [None]:
#Understand more about Location and Category cols before factorizing them into features.
len(df_success.Category.unique())

#### Need to get the parent location (e.g. New York-NY --> NY)

In [None]:
def get_parent_location(col):
    """Derive the parent location (the text after the last '-') for each entry of ``col``.

    The derived values are written to the notebook-level ``df_success`` frame
    as the ``Par_Location`` column, and that frame is returned. Both lengths
    are printed so a mismatch between ``col`` and the derived list is visible.
    """
    parents = [entry.split('-')[-1] for entry in col.tolist()]

    print('length of strip location: %s' % len(parents))
    print('length of df location column: %s' % len(col))

    df_success['Par_Location'] = parents
    return df_success

In [None]:
get_parent_location(df_success.Location).sample(1, random_state = 1)

In [None]:
#print(df_success.Category.unique());
print('length is: ' + str(len(df_success.Category.unique())));

#### The web-scraped dataset lists each project's sub-category rather than its main category. The following dictionary and function convert sub-categories into main categories, so that the number of features is kept small for further modeling.

In [None]:
Cat_dict = {'Art':['Ceramics', 'Conceptual Art', 'Digital Art', 'Illustration', 'Installations', 
       'Mixed Media', 'Painting', 'Performance Art', 'Public Art', 'Sculpture', 'Textiles', 'Video Art'], 
                 'Comics':['Anthologies', 'Comic Books', 'Events', 'Graphic Novels', 'Webcomics'],
                 'Crafts':['Candles', 'Crochet', 'DIY', 'Embroidery','Glass', 'Knitting', 'Pottery',
          'Printing', 'Quilts', 'Stationery', 'Taxidermy', 'Weaving', 'Woodworking'],
                 'Dance':['Performances', 'Residencies', 'Spaces', 'Workshops'],
                 'Design':['Architecture', 'Civic Design', 'Graphic Design', 'Interactive Design', 
                           'Product Design', 'Typography'],
                 'Fashion':['Accessories', 'Apparel', 'Childrenswear' , 'Couture', 
                            'Footwear', 'Jewelry', 'Pet Fashion', 'Ready-to-wear'],
                 'Action':['Action', 'Animation', 'Comedy', 'Documentary', 'Drama', 'Experimental', 
                           'Family', 'Fantasy', 'Festivals', 'Horror', 'Movie Theaters', 'Music Videos', 
                           'Narrative Film', 'Romance', 'Science Fiction', 'Shorts', 'Television', 
                           'Thrillers', 'Webseries'],
                 'Food':['Bacon', 'Community Gardens', 'Cookbooks', 'Drinks', 'Events', "Farmer's Markets", 
                         'Farms', 'Food Trucks', 'Restaurants', 'Small Batch', 'Spaces', 'Vegan'],
                 'Games':['Gaming Hardware', 'Live Games', 'Mobile Games', 'Playing Cards', 'Puzzles', 
                          'Tabletop Games', 'Video Games'],
                 'Journalism':['Audio', 'Photo', 'Print', 'Video'],
                 'Music':['Blues', 'Chiptune', 'Classical Music', 'Comedy', 'Country & Folk', 
                          'Electronic Music','Faith', 'Hip-Hop', 'Indie Rock', 'Jazz', 'Kids', 
                          'Latin', 'Metal', 'Pop', 'Punk', 'R&B', 'Rock', 'World Music'],
                 'Photography':['Animals', 'Fine Art', 'Nature', 'People', 'Photobooks', 'Places'],
                 'Publishing':['Academic', 'Anthologies', 'Art Books', 'Calendars', "Children's Books", 
                               'Comedy', 'Fiction', 'Letterpress', 'Literary Journals', 
                               'Nonfiction', 'Periodicals', 'Poetry', 'Radio & Podcasts',
                               'Translations', 'Young Adult', 'Zines', 'Literary Spaces'],
                 'Technology':['3D Printing', 'Apps', 'Camera Equipment', 'DIY Electronics', 
                                 'Fabrication Tools', 'Flight', 'Gadgets', 'Hardware', 'Makerspaces', 
                                 'Robots', 'Software', 'Sound', 'Space Exploration', 'Wearables', 'Web'],
                 'Theater':['Comedy', 'Experimental', 'Festivals', 'Immersive', 'Musical', 'Plays', 'Spaces']
                }

def invert_dict(d):
    """Flip a ``{key: [values]}`` mapping into ``{value: key}``.

    When a value is listed under several keys, the key visited last wins.
    """
    flipped = {}
    for parent in d:
        for child in d[parent]:
            flipped[child] = parent
    return flipped

Category_dict = invert_dict(Cat_dict)

In [None]:
def map_main_category(df, dict_):
    """Add a ``Main_Category`` column to ``df`` by looking up each ``Category`` in ``dict_``.

    ``df`` is modified in place and returned. Categories that are missing
    from ``dict_`` come out as NaN.
    """
    main_categories = df['Category'].map(dict_)
    df['Main_Category'] = main_categories
    return df

map_main_category(df_success, Category_dict)
df_success.fillna(0, inplace = True)
df_success.sample(3, random_state = 42);

In [None]:
# Rows whose Category had no entry in the mapping were filled with 0 above;
# fall back to the original Category for those rows.
# A single .loc assignment is used because the chained form
# `df.col[mask] = ...` writes to a temporary object and has no effect under
# pandas copy-on-write.
unmapped = df_success['Main_Category'] == 0
df_success.loc[unmapped, 'Main_Category'] = df_success.loc[unmapped, 'Category']
len(df_success.Main_Category.unique())

#### With all these success projects, how much more did they pledge comparing to their goals?

In [None]:
#With all these success projects, how much more did they pledge comparing to their goals?
def get_pledge_difference(df, pledged, goal):
    """Store ``df[pledged] - df[goal]`` in a ``Pledged_Difference`` column and return ``df``."""
    surplus = df[pledged].sub(df[goal])
    df['Pledged_Difference'] = surplus
    return df

get_pledge_difference(df_success, 'Total_Pledged', 'Goal').head(2);

In [None]:
df_success.loc[df_success.Pledged_Difference > 10000000];
#14517 projects exceeded their goals by $500
#4591 projects exceeded their goals by $5000
#1053 projects exceeded their goals by $50000
#120 projects exceeded their goals by $5000000
#3 projects exceeded their goals by $50000000 
#1 project that pledged over 10 mil

In [None]:
def get_average_pledgeamount_per_project(df, pledged, numberofbacker):
    """Store ``df[pledged] / df[numberofbacker]`` in ``Average_Pledge_Amount_byP`` and return ``df``."""
    per_backer = df[pledged].div(df[numberofbacker])
    df['Average_Pledge_Amount_byP'] = per_backer
    return df

#get_average_pledgeamount_per_project(df_success, 'Total_Pledged', 'Number_of_Backer').sample(3, random_state = 4)

In [None]:
def reorder_col(df, colnames):
    """Return ``df`` restricted to ``colnames``, with the columns in that order."""
    return df[colnames]

# NOTE: the raw column is named ' Inventor' (with a leading space). The former
# `df_success.rename(...)` call here discarded its result and so never renamed
# anything; it is removed rather than made effective because later cells drop
# the column under the name ' Inventor'.
colnames = ['Project_Name', 'Status', ' Inventor', 'Number_of_Backer', 'Total_Pledged', 'Goal', 
            'Par_Location', 'Location', 'Main_Category', 'Category', 'Number_of_Pledge_Options',
            'Pledge_Detail', 'Pledged_Difference']
reorder_col(df_success, colnames).sample(1, random_state = 42)

### Group by selected columns to visualize the dataset further

In [None]:
def graph_groupby_barplot(df, coltogroupbyon, coltodoaggon, figurex, figurey, numberoftoprows):
    """Bar-plot the largest group totals of one column.

    Sums ``coltodoaggon`` within each ``coltogroupbyon`` group, orders the
    totals from largest to smallest, and draws the first ``numberoftoprows``
    of them as a bar chart of size ``(figurex, figurey)``. Returns whatever
    ``DataFrame.plot`` returns.
    """
    totals = df.groupby([coltogroupbyon], as_index=False)[coltodoaggon].sum()
    ranked = pd.DataFrame(totals).sort_values([coltodoaggon], ascending=False)
    top_rows = ranked.head(numberoftoprows)
    return top_rows.plot(x=coltogroupbyon, y=coltodoaggon, kind='bar',
                         figsize=(figurex, figurey))

In [None]:
sns.set_context('poster')
graph_groupby_barplot(df_success, 'Par_Location', 'Pledged_Difference', 30, 10, 40)
plt.xlabel('Location', fontsize = 25);
plt.ylabel('Pledged $ Differences (Total Pledged - Goal)', fontsize = 25);
plt.title('Pledged Difference by Location (Top 40)', fontsize = 28);

In [None]:
# Select several columns from a groupby with a list (double brackets); the bare
# tuple form `[...]['a', 'b']` raises in pandas >= 2.0.
Location_by_AveragePledged = df_success.groupby(['Par_Location'], as_index=False)[['Number_of_Backer', 'Total_Pledged']].sum()
# NOTE(review): this subtracts a backer count from a dollar amount, which does not
# match the "average pledged per project" labels below -- confirm the intended metric.
Location_by_AveragePledged['Amount_Pledged_by_Par_Location'] = Location_by_AveragePledged.Total_Pledged\
                                                                    - Location_by_AveragePledged.Number_of_Backer

Location_by_AveragePledged.sort_values(['Amount_Pledged_by_Par_Location'], ascending=False, inplace = True)
# Only the first 10 locations are plotted, so the title says "Top 10".
Location_by_AveragePledged.head(10).plot(x = 'Par_Location', y = 'Amount_Pledged_by_Par_Location', kind = 'bar', figsize = (30, 10))
plt.xlabel('Location', fontsize = 25);
plt.ylabel('Amount $ Pledged/Project by Location', fontsize = 25);
plt.title('Average Amount $ Pledged/Project by Location (Top 10)', fontsize = 28);

In [None]:
graph_groupby_barplot(df_success, 'Main_Category', 'Number_of_Backer', 30, 10, 40)
plt.xlabel('Category', fontsize = 25);
plt.ylabel('Number of Backers', fontsize = 25);
plt.title('Number of Backers by Category (Top 40)', fontsize = 28);

### Examine distribution of the Target Feature

In [None]:
sns.distplot(df_success['Total_Pledged'], 
             bins = 100,
             kde_kws = {'color': '#e5ae38', 'label': 'KDE', 'clip':(0,50000)},
             hist_kws = {'alpha': 0.25, 
                         'label': 'Total_Pledged', 
                         'edgecolor':'b', 
                         'linewidth':1, 
                         'color':'#4dd2ff',
                         'range':[0,50000]},)

### BASELINE MODEL (V1)

Feature using (X):
- Number of Backer
- Goal
- Number of Pledged Option

Feature predicting (Y):
- Total Pledged

In [None]:
# Build the baseline frame without the free-text / label columns.
# DataFrame.drop returns a new frame, so its result must be assigned to take
# effect (the previous standalone `.drop(...)` call was a no-op).
baseline_dfs = df_success.drop(columns=['Pledge_Detail', 'Location', 'Status', 'Category'])

In [None]:
baseline_dfs_col = ['Number_of_Backer', 'Goal', 'Number_of_Pledge_Options', 'Total_Pledged']
baseline_dfs = reorder_col(baseline_dfs, baseline_dfs_col)
baseline_dfs.sample(1, random_state = 23)

In [None]:
b_X = baseline_dfs.drop(columns=['Total_Pledged'])
b_feature_names = list(b_X.columns)  # keep the names before converting to a bare array
b_Y = baseline_dfs.Total_Pledged
b_X, b_Y = np.array(b_X), np.array(b_Y)

#split test and train dataset, hold out 20% of the data for final testing
b_X_train, b_X_test, b_Y_train, b_Y_test = train_test_split(b_X, b_Y, test_size=0.2,random_state=2173)

kf = KFold(n_splits=5, shuffle=True, random_state = 47)

lm_baseline = [] #collect the validation results for base models

for train_ind, val_ind in kf.split(b_X_train, b_Y_train):
    
    X_train, Y_train = b_X_train[train_ind], b_Y_train[train_ind]
    X_val, Y_val = b_X_train[val_ind], b_Y_train[val_ind] 
    
    #simple linear regression
    lm_b = LinearRegression()

    lm_b.fit(X_train, Y_train)
    lm_baseline.append(lm_b.score(X_val, Y_val))


#Examine the intercept and coefficients of the model fitted on the last CV fold
print('baseline linear regression model intercept: %f' % lm_b.intercept_)
# A list keeps the (feature, coefficient) pairs in column order; a set would scramble them.
print(list(zip(b_feature_names, lm_b.coef_)))
print('Simple regression scores: %s' % lm_baseline)
print('Simple mean cv r^2: %.3f +- %.3f' % (np.mean(lm_baseline),np.std(lm_baseline)))

In [None]:
lsm = smf.ols('Total_Pledged ~ Number_of_Backer + Goal + Number_of_Pledge_Options', data = df_success)
fit1 = lsm.fit()
print(fit1.summary())

In [None]:
# Residual plot for the baseline model on the hold-out set.
# A funnel shape indicates heteroscedasticity (non-constant error variance), not
# collinearity; a transformation of the target (e.g. log) may help.
# The predictions are computed here so this cell does not depend on a later cell
# (Restart & Run All previously failed with a NameError on baseline_pred).
baseline_pred = lm_b.predict(b_X_test)
# Plot the model's own residuals directly: sns.residplot fits a second
# regression of y on x and would show the residuals of that fit instead.
plt.scatter(baseline_pred, b_Y_test - baseline_pred, alpha=0.2)
plt.axhline(0, linestyle=':', color='k')
plt.xlabel('Predicted Value')
plt.ylabel('Residual')
plt.title('Residual Plot for Baseline Model')

In [None]:
baseline_pred = lm_b.predict(b_X_test)
# jointplot's regression kind is 'reg' (there is no 'regplot' kind); x and y are
# passed by keyword and the figure-size argument is `height` in current seaborn.
sns.jointplot(x=baseline_pred, y=b_Y_test, kind='reg', height=8)
sns.set_context('talk')
print("Linear Regression Baseline:", r2_score(b_Y_test, baseline_pred))

### LINEAR REGRESSION MODEL WITH ADDITIONAL CATEGORICAL FEATURES (V2)

Feature using (X):
- Number of Backer
- Goal
- Number of Pledged Option
- Parent Location (convert using get_dummies)
- Main Category (convert using get_dummies)

Feature predicting (Y):
- Total Pledged

In [None]:
add_dummies_dfs = df_success.copy()
add_dummies_dfs.drop(columns=['Project_Name', 'Pledge_Detail', 'Status', 
                              ' Inventor', 'Pledged_Difference'], inplace = True);

In [None]:
add_dummies_dfs_col = ['Par_Location', 'Main_Category', 'Number_of_Backer', 
                       'Goal', 'Number_of_Pledge_Options', 'Total_Pledged']
add_dummies_dfs = reorder_col(add_dummies_dfs, add_dummies_dfs_col)
add_dummies_dfs.rename(columns={' Inventor':'Inventor', 
                                'Par_Location':'Location', 
                                'Main_Category':'Category'}, inplace = True)
add_dummies_dfs.sample(1, random_state = 23)

In [None]:
#create dummy variable for Location and Category col
add_dummies_dfs = pd.get_dummies(data = add_dummies_dfs, columns = ['Location', 'Category'])
add_dummies_dfs.shape

In [None]:
b2_X = add_dummies_dfs.drop(columns=['Total_Pledged'])
b2_Y = add_dummies_dfs.Total_Pledged
b2_X, b2_Y = np.array(b2_X), np.array(b2_Y)

#split test and train dataset, hold out 20% of the data for final testing
b2_X_train, b2_X_test, b2_Y_train, b2_Y_test = train_test_split(b2_X, b2_Y, 
                                                                test_size=0.2,
                                                                random_state=99)

#### Simple Regression Model (5-fold CV)

In [None]:
lm_v2_r2s = [] #collect the validation results for base models

for train_ind, val_ind in kf.split(b2_X_train, b2_Y_train):
    
    X_train, Y_train = b2_X_train[train_ind], b2_Y_train[train_ind]
    X_val, Y_val = b2_X_train[val_ind], b2_Y_train[val_ind] 
    
    #simple linear regression
    lm_v2 = LinearRegression()

    lm_v2.fit(X_train, Y_train)
    lm_v2_r2s.append(lm_v2.score(X_val, Y_val))


#Examine linear regression coefficient
#print('(Dummy Variable Added) Simple regression model coefficient: %f' % lm_v2.intercept_)
#print(set(zip((add_dummies_dfs.drop(columns=['Total_Pledged']).columns), lm_v2.coef_)))
print('(Dummy Variable Added) Simple regression scores: %s' % lm_v2_r2s)
print('Simple mean cv r^2: %.3f +- %.3f' % (np.mean(lm_v2_r2s),np.std(lm_v2_r2s)))

#### - Simple regression is not a good model: depending on the assigned random_state value, the model score changes dramatically, ranging from a reasonable fit of 0.65 to about -4000k.

#### Lasso, Ridge, and ElasticNet Model (5-fold CV)

In [None]:
cv_lasso_r2s, cv_ridge_r2s, cv_EN_r2s = [], [], [] #collect the validation results for both models

for train_ind, val_ind in kf.split(b2_X_train, b2_Y_train):
    
    X_train, y_train = b2_X_train[train_ind], b2_Y_train[train_ind]
    X_val, y_val = b2_X_train[val_ind], b2_Y_train[val_ind] 
    
    #feature scaling for lasso, ridge, EN
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    b2_X_test 
    
    #Ridge regression
    ridge_v2= Ridge(alpha = ridge_v2_grid_srh.best_params_['alpha'])
    ridge_v2.fit(X_train_scaled, y_train)
    cv_ridge_r2s.append(ridge_v2.score(X_val_scaled, y_val))
    
    #lasso regression
    lasso_v2= Lasso(alpha = lasso_v2_grid_srh.best_params_['alpha'])
    lasso_v2.fit(X_train_scaled, y_train)
    cv_lasso_r2s.append(lasso_v2.score(X_val_scaled, y_val))
    
    #EN regression
    EN_v2= ElasticNet(alpha = EN_v2_grid_srh.best_params_['alpha'])
    EN_v2.fit(X_train_scaled, y_train)
    cv_EN_r2s.append(EN_v2.score(X_val_scaled, y_val))

    
print('Ridge scores: ', cv_ridge_r2s, '\n')
print('Ridge mean cv r^2: %.3f +- %.3f' %(np.mean(cv_ridge_r2s),np.std(cv_ridge_r2s)))
print('Lasso scores: ', cv_lasso_r2s, '\n')
print('Lasso mean cv r^2: %.3f +- %.3f' %(np.mean(cv_lasso_r2s),np.std(cv_lasso_r2s)))
print('EN scores: ', cv_EN_r2s)
print('EN mean cv r^2: %.3f +- %.3f' %(np.mean(cv_EN_r2s),np.std(cv_EN_r2s)))

In [None]:
#use grid search to search for best alpha parameter for lasso, ridge, and EN
def build_grid_search_est(model, X, y, cv=5, **params):
    grid_est = GridSearchCV(model, param_grid=params, cv=cv)
    grid_est.fit(X, y)
    df = pd.DataFrame(grid_est.grid_scores_)
    for param in params:
        df[param] = df.parameters.apply(lambda val: val[param])
        plt.semilogx(df.alpha, df.mean_validation_score)
    grid_est.grid_scores_
    return grid_est

In [None]:
print("Lasso_v2 Grid Search")
lasso_v2_grid_srh = build_grid_search_est(Lasso(), 
                                          b2_X_train, 
                                          b2_Y_train, 
                                          cv=kf, 
                                          alpha=np.logspace(1, 3, 20))

In [None]:
print("ridge_v2 Grid Search")
ridge_v2_grid_srh = build_grid_search_est(Ridge(), 
                                          b2_X_train, 
                                          b2_Y_train, 
                                          cv=kf, 
                                          alpha=np.logspace(2, 4, 20))

In [None]:
print("ElasticNet_v2 Grid Search")
EN_v2_grid_srh = build_grid_search_est(ElasticNet(), 
                                       b2_X_train, 
                                       b2_Y_train, 
                                       cv=kf, 
                                       alpha=np.logspace(-2.5, 0.5, 20))

In [None]:
# Scale the hold-out features with statistics learned from the training data only.
# Fitting a scaler on the test set leaks test information and puts the test
# features on a different scale from the one the models were trained on.
scaler = StandardScaler().fit(b2_X_train)
b2_X_test_scaled = scaler.transform(b2_X_test)

# lm_v2 was fit on unscaled features, so it is evaluated on unscaled features.
lm_v2_pred = lm_v2.predict(b2_X_test)
print("Linear Regression:", r2_score(b2_Y_test, lm_v2_pred))

lasso_v2_pred = lasso_v2.predict(b2_X_test_scaled)
print("Lasso Regression:", r2_score(b2_Y_test, lasso_v2_pred))

ridge_v2_pred = ridge_v2.predict(b2_X_test_scaled)
print("Ridge Regression:", r2_score(b2_Y_test, ridge_v2_pred))

EN_v2_pred = EN_v2.predict(b2_X_test_scaled)
print("ElasticNet Regression:", r2_score(b2_Y_test, EN_v2_pred))

In [None]:
# Check which features are eliminated (shrunk to zero) by regularization.

models = {}

models['ridgev2'] = ridge_v2
models['lassov2'] = lasso_v2
models['ENv2'] = EN_v2

# coef_ has one entry per feature column, so the target column must be left out
# of the names; otherwise every name after Total_Pledged is shifted by one.
v2_feature_names = add_dummies_dfs.drop(columns=['Total_Pledged']).columns

for name,model in models.items():
    model.fit(b2_X_train,b2_Y_train)
    print('Model: ' + name)
    print("Score: " + str(model.score(b2_X_train,b2_Y_train)))
    sorted_features = sorted(zip(v2_feature_names,model.coef_), key=lambda tup: abs(tup[1]), reverse=True)
    for feature in sorted_features:
        print(feature)
        
    print("")

### WORD VECTORIZATION OF PROJECT NAMES AS ADDITIONAL FEATURES (V3) 

Feature using (X):
- Number of Backer
- Goal
- Number of Pledged Option
- Parent Location (convert using get_dummies)
- Main Category (convert using get_dummies)
- Project Name (Vectorized)
- Inventor Name (Vectorized)

Feature predicting (Y):
- Total Pledged

In [None]:
pre_vec_df = df_success.copy()
pre_vec_df.Project_Name = pre_vec_df.Project_Name.str.lower()
pre_vec_df.head(1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

transformer = TfidfTransformer(smooth_idf=False)
vectorizer = TfidfVectorizer(stop_words=['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
                                         'has', 'he', 'in', 'is', 'its', 'it', 'of', 'on', 'that', 'the',
                                         'to', 'was', 'were', 'will', 'with', 'she', 'mm', 'off'], min_df = 2, max_features = 1000)

#### Replace all non-alphabetic characters in the project names with nothing, and drop all non-relevant columns

In [None]:
# Assign the cleaned text back to the column. Calling an in-place method on the
# selected column (`df[col].replace(..., inplace=True)`) operates on a temporary
# object and has no effect under pandas copy-on-write.
pre_vec_df['Project_Name'] = pre_vec_df['Project_Name'].replace(r'([^a-z\s])', '', regex=True)
pre_vec_df.drop(columns = ['Location', 'Category', 'Pledge_Detail', 'Status', ' Inventor', 'Pledged_Difference'], inplace = True)

In [None]:
pre_vec_df.shape

In [None]:
project_tfidf_vec = vectorizer.fit_transform(pre_vec_df.Project_Name).toarray()
# Column j of the matrix belongs to the term whose vocabulary_ value is j.
# vocabulary_.keys() is in first-seen order, so using it directly mislabels the columns.
tfidf_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
# Reuse pre_vec_df's index so that the later pd.concat(axis=1) aligns row for row
# instead of producing NaN-padded, mismatched rows.
project_tfidf_df = pd.DataFrame(project_tfidf_vec, columns=tfidf_terms, index=pre_vec_df.index)
project_tfidf_df.shape

In [None]:
frame = [pre_vec_df, project_tfidf_df]
project_vec_df = pd.concat(frame, axis = 1)
project_vec_df.rename(columns={'Par_Location':'Location', 'Main_Category':'Category'}, inplace=True)

In [None]:
#NLP without dummy
project_vec_df_nodummy = project_vec_df.copy()
project_vec_df_nodummy.drop(columns=['Project_Name', 'Location', 'Category'], inplace = True, axis = 1)
project_vec_df_nodummy.fillna(0, inplace = True)
project_vec_df_nodummy.shape

##### Now that we have vectorized the project names into the top 1000 terms (`max_features = 1000`), let's also create the dummy variables for the Location and Category columns

In [None]:
#NLP with dummy
project_vec_df = pd.get_dummies(data = project_vec_df, columns = ['Location', 'Category'])
project_vec_df.drop(columns=['Project_Name'], inplace = True, axis = 1)
project_vec_df.fillna(0, inplace = True)

In [None]:
project_vec_df.shape

In [None]:
#split for NLP with dummy
b3_X = project_vec_df.drop(columns=['Total_Pledged'])
b3_Y = project_vec_df.Total_Pledged
b3_X, b3_Y = np.array(b3_X), np.array(b3_Y)

#split test and train dataset, hold out 20% of the data for final testing
b3_X_train, b3_X_test, b3_Y_train, b3_Y_test = train_test_split(b3_X, b3_Y, 
                                                                test_size=0.2,
                                                                random_state=19)

In [None]:
#split for NLP without dummy
b4_X = project_vec_df_nodummy.drop(columns=['Total_Pledged'])
b4_Y = project_vec_df_nodummy.Total_Pledged
b4_X, b4_Y = np.array(b4_X), np.array(b4_Y)

#split test and train dataset, hold out 20% of the data for final testing
b4_X_train, b4_X_test, b4_Y_train, b4_Y_test = train_test_split(b4_X, b4_Y, 
                                                                test_size=0.2,
                                                                random_state=19)

#### Lasso, Ridge, and EN model

#### GridSearch to find best shrinking parameter

In [None]:
print("ridge_v3 Grid Search")
ridge_v3_grid_srh = build_grid_search_est(Ridge(), 
                                          b3_X_train, 
                                          b3_Y_train, 
                                          cv=kf, 
                                          alpha=np.logspace(-1, 3, 20));

In [None]:
print("EN_v3 Grid Search")
EN_v3_grid_srh = build_grid_search_est(ElasticNet(), 
                                          b3_X_train, 
                                          b3_Y_train, 
                                          cv=kf, 
                                          alpha=np.logspace(-3, 1, 20));

In [None]:
print("lasso_v3 Grid Search")
lasso_v3_grid_srh = build_grid_search_est(Lasso(), 
                                          b3_X_train, 
                                          b3_Y_train, 
                                          cv=kf, 
                                          alpha=np.logspace(-1, 3, 20));

In [None]:
print("ridge_v4 Grid Search")
ridge_v4_grid_srh = build_grid_search_est(Ridge(), 
                                          b4_X_train, 
                                          b4_Y_train, 
                                          cv=kf, 
                                          alpha=np.logspace(-1, 3, 20));

In [None]:
print("lasso_v4 Grid Search")
lasso_v4_grid_srh = build_grid_search_est(Lasso(), 
                                          b4_X_train, 
                                          b4_Y_train, 
                                          cv=kf, 
                                          alpha=np.logspace(-1, 3, 20));

In [None]:
print("EN_v4 Grid Search")
EN_v4_grid_srh = build_grid_search_est(ElasticNet(), 
                                          b4_X_train, 
                                          b4_Y_train, 
                                          cv=kf, 
                                          alpha=np.logspace(-3, 1, 20));

In [None]:
print('best ElasticNet_dummy alpha: %.3f' % EN_v3_grid_srh.best_params_['alpha'])
print('best Lasso alpha_dummy: %.3f' % lasso_v3_grid_srh.best_params_['alpha'])
print('best Ridge alpha_dummy: %.3f' % ridge_v3_grid_srh.best_params_['alpha'])
print('best ElasticNet_nodummy alpha: %.3f' % EN_v4_grid_srh.best_params_['alpha'])
print('best Lasso alpha_nodummy: %.3f' % lasso_v4_grid_srh.best_params_['alpha'])
print('best Ridge alpha_nodummy: %.3f' % ridge_v4_grid_srh.best_params_['alpha'])

In [None]:
#fit with dummy variable df
cv_lasso3_r2s, cv_ridge3_r2s, cv_EN3_r2s = [], [], [] #collect the validation results for both models

for train_ind, val_ind in kf.split(b3_X_train, b3_Y_train):
    
    X_train, y_train = b3_X_train[train_ind], b3_Y_train[train_ind]
    X_val, y_val = b3_X_train[val_ind], b3_Y_train[val_ind] 
    
    #feature scaling for lasso, ridge, EN
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    #Ridge regression
    ridge_v3= Ridge(alpha = ridge_v3_grid_srh.best_params_['alpha'])
    ridge_v3.fit(X_train_scaled, y_train)
    cv_ridge3_r2s.append(ridge_v3.score(X_val_scaled, y_val))
    
    #lasso regression
    lasso_v3= Lasso(alpha = lasso_v3_grid_srh.best_params_['alpha'])
    lasso_v3.fit(X_train_scaled, y_train)
    cv_lasso3_r2s.append(lasso_v3.score(X_val_scaled, y_val))
    
    #EN regression
    EN_v3= ElasticNet(alpha = EN_v3_grid_srh.best_params_['alpha'])
    EN_v3.fit(X_train_scaled, y_train)
    cv_EN3_r2s.append(EN_v3.score(X_val_scaled, y_val))

In [None]:
#fit without dummy variable df
cv_lasso4_r2s, cv_ridge4_r2s, cv_EN4_r2s = [], [], [] #collect the validation results for both models

for train_ind, val_ind in kf.split(b4_X_train, b4_Y_train):
    
    X_train, y_train = b4_X_train[train_ind], b4_Y_train[train_ind]
    X_val, y_val = b4_X_train[val_ind], b4_Y_train[val_ind] 
    
    #feature scaling for lasso, ridge, EN
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    #Ridge regression
    ridge_v4= Ridge(alpha = ridge_v4_grid_srh.best_params_['alpha'])
    ridge_v4.fit(X_train_scaled, y_train)
    cv_ridge4_r2s.append(ridge_v4.score(X_val_scaled, y_val))
    
    #lasso regression
    lasso_v4= Lasso(alpha = lasso_v4_grid_srh.best_params_['alpha'])
    lasso_v4.fit(X_train_scaled, y_train)
    cv_lasso4_r2s.append(lasso_v4.score(X_val_scaled, y_val))
    
    #EN regression
    EN_v4= ElasticNet(alpha = EN_v4_grid_srh.best_params_['alpha'])
    EN_v4.fit(X_train_scaled, y_train)
    cv_EN4_r2s.append(EN_v4.score(X_val_scaled, y_val))

In [None]:
print('Ridge_dummy scores: ', cv_ridge3_r2s, '\n')
print('Ridge_dummy mean cv r^2: %.3f +- %.3f' %(np.mean(cv_ridge3_r2s),np.std(cv_ridge3_r2s)))
print('Lasso_dummy scores: ', cv_lasso3_r2s, '\n')
print('Lasso_dummy mean cv r^2: %.3f +- %.3f' %(np.mean(cv_lasso3_r2s),np.std(cv_lasso3_r2s)))
print('EN_dummy scores: ', cv_EN3_r2s)
print('EN_dummy mean cv r^2: %.3f +- %.3f' %(np.mean(cv_EN3_r2s),np.std(cv_EN3_r2s)))

print('Ridge_nodummy scores: ', cv_ridge4_r2s, '\n')
print('Ridge_nodummy mean cv r^2: %.3f +- %.3f' %(np.mean(cv_ridge4_r2s),np.std(cv_ridge4_r2s)))
print('Lasso_nodummy scores: ', cv_lasso4_r2s, '\n')
print('Lasso_nodummy mean cv r^2: %.3f +- %.3f' %(np.mean(cv_lasso4_r2s),np.std(cv_lasso4_r2s)))
print('EN_nodummy scores: ', cv_EN4_r2s)
print('EN_nodummy mean cv r^2: %.3f +- %.3f' %(np.mean(cv_EN4_r2s),np.std(cv_EN4_r2s)))

In [None]:
# Scale each hold-out set with statistics from its own training split; a scaler
# fit on the test data leaks information and mis-scales the features.
scaler = StandardScaler()
b3_X_test_scaled = scaler.fit(b3_X_train).transform(b3_X_test)
b4_X_test_scaled = scaler.fit(b4_X_train).transform(b4_X_test)

lasso_v3_pred = lasso_v3.predict(b3_X_test_scaled)
print("Lasso Regression:", r2_score(b3_Y_test, lasso_v3_pred))
ridge_v3_pred = ridge_v3.predict(b3_X_test_scaled)
print("Ridge Regression:", r2_score(b3_Y_test, ridge_v3_pred))
EN_v3_pred = EN_v3.predict(b3_X_test_scaled)
print("ElasticNet Regression:", r2_score(b3_Y_test, EN_v3_pred))
lasso_v4_pred = lasso_v4.predict(b4_X_test_scaled)
print("Lasso_nodummy Regression:", r2_score(b4_Y_test, lasso_v4_pred))
ridge_v4_pred = ridge_v4.predict(b4_X_test_scaled)
print("Ridge_nodummy Regression:", r2_score(b4_Y_test, ridge_v4_pred))
# The no-dummy ElasticNet must be scored with EN_v4 on the b4 features
# (this line previously reused EN_v3 and the b3 features).
EN_v4_pred = EN_v4.predict(b4_X_test_scaled)
print("ElasticNet_nodummy Regression:", r2_score(b4_Y_test, EN_v4_pred))

In [None]:
#NLP + Dummy feature selection
models = {}

models['ridgev3'] = ridge_v3
models['lassov3'] = lasso_v3
models['ENv3'] = EN_v3

# coef_ has one entry per feature column, so the target column must be left out
# of the names; otherwise names and coefficients are misaligned.
v3_feature_names = project_vec_df.drop(columns=['Total_Pledged']).columns

for name,model in models.items():
    model.fit(b3_X_train,b3_Y_train)
    print('Model: ' + name)
    print("Score: " + str(model.score(b3_X_train,b3_Y_train)))
    sorted_features = sorted(zip(v3_feature_names,model.coef_), key=lambda tup: abs(tup[1]), reverse=True)
    for feature in sorted_features:
        print(feature)
        
    print("")

In [None]:
lassov3fs = smf.ols("Total_Pledged ~ Location_NV + Number_of_Backer + Goal + blood + Category_Comics + \
Location_MA + Location_TN + media + Category_Art + platform + Location_France + Category_Food + comics +\
Category_Publishing + bbq + money + photobook + desert + radio", data = project_vec_df)
lassov3fs1 = lassov3fs.fit()
print(lassov3fs1.summary())

In [None]:
# Residual plot for the V3 lasso model on the hold-out set.
# The residuals are plotted directly: sns.residplot fits a second regression of
# y on x and would show the residuals of that fit instead.
plt.scatter(lasso_v3_pred, b3_Y_test - lasso_v3_pred, alpha=0.2)
plt.axhline(0, linestyle=':', color='k')
plt.xlabel('Predicted Value')
plt.ylabel('Residual')
# The title previously said "Baseline Model", but these are the V3 lasso predictions.
plt.title('Residual Plot for Lasso Model (V3)')

In [None]:
# Start from a fresh dict: `models` otherwise still holds the V3 estimators from
# the earlier cell, and the loop below would refit them on the b4 (no-dummy) data.
models = {}

models['ridgev4'] = ridge_v4
models['lassov4'] = lasso_v4
models['ENv4'] = EN_v4

# coef_ has one entry per feature column, so the target column must be left out
# of the names; otherwise names and coefficients are misaligned.
v4_feature_names = project_vec_df_nodummy.drop(columns=['Total_Pledged']).columns

for name,model in models.items():
    model.fit(b4_X_train,b4_Y_train)
    print('Model: ' + name)
    print("Score: " + str(model.score(b4_X_train,b4_Y_train)))
    sorted_features = sorted(zip(v4_feature_names,model.coef_), key=lambda tup: abs(tup[1]), reverse=True)
    for feature in sorted_features:
        print(feature)
        
    print("")

In [None]:
lassov4fs = smf.ols("Total_Pledged ~ radio + desert + photobook + \
money + bbq + comics + memory + media + platform + experiment + blood + Number_of_Backer +\
Goal + Number_of_Pledge_Options", data = project_vec_df_nodummy)
lassov4fs1 = lassov4fs.fit()
print(lassov4fs1.summary())

In [None]:
#with dummy top 5(out of 1000)
weights = np.asarray(project_tfidf_vec.mean(axis=0)).ravel().tolist()
# Terms in matrix-column order, built from vocabulary_ because
# vectorizer.get_feature_names() was removed in scikit-learn 1.2.
tfidf_term_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
counts_df = pd.DataFrame({'term': tfidf_term_names, 'weight': weights})
top5_feature = counts_df.sort_values(by='weight', ascending=False).head(5)
top5_feature

sns.barplot(x=top5_feature.term, y=top5_feature.weight, data=top5_feature)
plt.xlabel('Most Associated Vocabulary')
plt.ylabel('TFIDF Weight')

In [None]:
#with dummy last 5(out of 1000)
tail5_feature = counts_df.sort_values(by='weight', ascending=True).head(5)
tail5_feature

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Top 20 terms by mean TF-IDF weight.
# `tvec` / `tvec_weights` are not defined in any preceding cell; the fitted
# objects in this notebook are `vectorizer` and `project_tfidf_vec`.
weights = np.asarray(project_tfidf_vec.mean(axis=0)).ravel().tolist()
# Terms in matrix-column order (vocabulary_ maps term -> column index).
weight_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
weights_df = pd.DataFrame({'term': weight_terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

In [None]:
# Residual plot for the v4 lasso model on the held-out test split.
# sns.residplot regresses y on x and plots the residuals of *that* auxiliary
# fit, so handing it already-computed residuals does not show the model's own
# errors. Scatter the residuals (actual - predicted) directly instead.
# The former `data=` argument was unused because x and y were passed as vectors.
# NOTE(review): lasso_v4_pred is assigned in the test-set evaluation cell that
# appears further down the notebook; that cell has to run before this one.
lasso_v4_resid = b4_Y_test - lasso_v4_pred
plt.scatter(lasso_v4_pred, lasso_v4_resid, alpha=0.2)
plt.axhline(0, linestyle=':', color='.2')  # zero-error reference line
plt.xlabel('Predicted Value')
plt.ylabel('Residual')
plt.title('Residual Plot for lasso final Model');

### Because of the high collinearity in the dataset, try gradient boosting (GBR) and random forest (RF) regressors to see how they perform

##### Baseline

In [None]:
# Cross-validated baseline for the tree ensembles: for every fold produced by
# `kf`, fit a random forest and a gradient-boosting regressor on the training
# part and record the regressor's .score() (R^2) on the held-out part.
GBM_baseline, rf_baseline = [], []  # per-fold validation R^2 of the baseline models

for train_ind, val_ind in kf.split(b_X_train, b_Y_train):

    # Positional indexing with the fold's index arrays.
    # NOTE(review): presumably b_X_train / b_Y_train are NumPy arrays — confirm.
    X_train, Y_train = b_X_train[train_ind], b_Y_train[train_ind]
    X_val, Y_val = b_X_train[val_ind], b_Y_train[val_ind]

    # RF — seeded so the fold scores are reproducible across re-runs
    rf_b = RandomForestRegressor(n_estimators=800, max_features=3, random_state=42)
    rf_b.fit(X_train, Y_train)
    rf_baseline.append(rf_b.score(X_val, Y_val))

    # GBM — seeded for the same reason
    gbm_b = GradientBoostingRegressor(n_estimators=100, max_depth=3, learning_rate=.1, random_state=42)
    gbm_b.fit(X_train, Y_train)
    GBM_baseline.append(gbm_b.score(X_val, Y_val))


# Examine the baseline RF and GBM models: per-fold scores, then mean +- std
print('Baseline RF scores: %s' % rf_baseline)
print('Baseline RF R^2: %.3f +- %.3f' % (np.mean(rf_baseline),np.std(rf_baseline)))
print('Baseline GBM scores: %s' % GBM_baseline)
print('Baseline GBM R^2: %.3f +- %.3f' % (np.mean(GBM_baseline),np.std(GBM_baseline)))

##### Dummy Variable Feature added in Model

In [None]:
# Cross-validate the tree ensembles on the v2 (dummy-variable) feature set:
# for every fold produced by `kf`, fit both regressors on the training part
# and record the regressor's .score() (R^2) on the held-out part.
GBM_v2_r2s, rf_v2_r2s = [], []  # per-fold validation R^2 of the v2 models

for train_ind, val_ind in kf.split(b2_X_train, b2_Y_train):

    # Positional indexing with the fold's index arrays.
    # NOTE(review): presumably b2_X_train / b2_Y_train are NumPy arrays — confirm.
    X_train, Y_train = b2_X_train[train_ind], b2_Y_train[train_ind]
    X_val, Y_val = b2_X_train[val_ind], b2_Y_train[val_ind]

    # RF — seeded so the fold scores are reproducible across re-runs
    rf_v2 = RandomForestRegressor(n_estimators=800, max_features=3, random_state=42)
    rf_v2.fit(X_train, Y_train)
    rf_v2_r2s.append(rf_v2.score(X_val, Y_val))

    # GBM — seeded for the same reason
    gbm_v2 = GradientBoostingRegressor(n_estimators=100, max_depth=3, learning_rate=.1, random_state=42)
    gbm_v2.fit(X_train, Y_train)
    GBM_v2_r2s.append(gbm_v2.score(X_val, Y_val))


# Examine the v2 (just dummy) RF and GBM models.
# Labels previously read "Baseline", which made this output indistinguishable
# from the baseline cell's output.
print('v2 (dummy) RF scores: %s' % rf_v2_r2s)
print('v2 (dummy) RF R^2: %.3f +- %.3f' % (np.mean(rf_v2_r2s),np.std(rf_v2_r2s)))
print('v2 (dummy) GBM scores: %s' % GBM_v2_r2s)
print('v2 (dummy) GBM R^2: %.3f +- %.3f' % (np.mean(GBM_v2_r2s),np.std(GBM_v2_r2s)))

In [None]:
# Cross-validate the tree ensembles on the v3 (NLP with dummy) feature set:
# for every fold produced by `kf`, fit both regressors on the training part
# and record the regressor's .score() (R^2) on the held-out part.
GBM_v3_r2s, rf_v3_r2s = [], []  # per-fold validation R^2 of the v3 models

for train_ind, val_ind in kf.split(b3_X_train, b3_Y_train):

    # Positional indexing with the fold's index arrays.
    # NOTE(review): presumably b3_X_train / b3_Y_train are NumPy arrays — confirm.
    X_train, Y_train = b3_X_train[train_ind], b3_Y_train[train_ind]
    X_val, Y_val = b3_X_train[val_ind], b3_Y_train[val_ind]

    # RF — seeded so the fold scores are reproducible across re-runs
    rf_v3 = RandomForestRegressor(n_estimators=800, max_features=3, random_state=42)
    rf_v3.fit(X_train, Y_train)
    rf_v3_r2s.append(rf_v3.score(X_val, Y_val))

    # GBM — seeded for the same reason
    gbm_v3 = GradientBoostingRegressor(n_estimators=100, max_depth=3, learning_rate=.1, random_state=42)
    gbm_v3.fit(X_train, Y_train)
    GBM_v3_r2s.append(gbm_v3.score(X_val, Y_val))


# Examine the v3 (NLP with dummy) RF and GBM models.
# Labels previously read "Baseline", which made this output indistinguishable
# from the baseline cell's output.
print('v3 (NLP + dummy) RF scores: %s' % rf_v3_r2s)
print('v3 (NLP + dummy) RF R^2: %.3f +- %.3f' % (np.mean(rf_v3_r2s),np.std(rf_v3_r2s)))
print('v3 (NLP + dummy) GBM scores: %s' % GBM_v3_r2s)
print('v3 (NLP + dummy) GBM R^2: %.3f +- %.3f' % (np.mean(GBM_v3_r2s),np.std(GBM_v3_r2s)))

In [None]:
# Cross-validate the tree ensembles on the v4 feature set (the b4_* split):
# for every fold produced by `kf`, fit both regressors on the training part
# and record the regressor's .score() (R^2) on the held-out part.
GBM_v4_r2s, rf_v4_r2s = [], []  # per-fold validation R^2 of the v4 models

for train_ind, val_ind in kf.split(b4_X_train, b4_Y_train):

    # Positional indexing with the fold's index arrays.
    # NOTE(review): presumably b4_X_train / b4_Y_train are NumPy arrays — confirm.
    X_train, Y_train = b4_X_train[train_ind], b4_Y_train[train_ind]
    X_val, Y_val = b4_X_train[val_ind], b4_Y_train[val_ind]

    # RF — seeded so the fold scores are reproducible across re-runs
    rf_v4 = RandomForestRegressor(n_estimators=800, max_features=3, random_state=42)
    rf_v4.fit(X_train, Y_train)
    rf_v4_r2s.append(rf_v4.score(X_val, Y_val))

    # GBM — seeded for the same reason
    gbm_v4 = GradientBoostingRegressor(n_estimators=100, max_depth=3, learning_rate=.1, random_state=42)
    gbm_v4.fit(X_train, Y_train)
    GBM_v4_r2s.append(gbm_v4.score(X_val, Y_val))


# Examine the v4 (NLP, no dummy) RF and GBM models.
# The comment here previously said "v3" and the labels read "Baseline";
# both were copy-paste leftovers.
print('v4 (NLP, no dummy) RF scores: %s' % rf_v4_r2s)
print('v4 (NLP, no dummy) RF R^2: %.3f +- %.3f' % (np.mean(rf_v4_r2s),np.std(rf_v4_r2s)))
print('v4 (NLP, no dummy) GBM scores: %s' % GBM_v4_r2s)
print('v4 (NLP, no dummy) GBM R^2: %.3f +- %.3f' % (np.mean(GBM_v4_r2s),np.std(GBM_v4_r2s)))

In [None]:
# Held-out test-set R^2 for the regularised linear models (v3 and v4 feature sets).
# lasso_v3_pred and lasso_v4_pred are also read by the residual-plot cells
# that appear earlier in the notebook.
#
# NOTE(review): the scaler is fitted on the *test* matrices here. If the models
# were trained on features standardised with training-set statistics, the test
# matrices should be transformed with that same fitted scaler instead (and if
# the models were trained on unscaled features, no scaling belongs here at
# all) — confirm against the training cells before trusting these numbers.
scaler = StandardScaler()
b3_X_test_scaled = scaler.fit_transform(b3_X_test)
b4_X_test_scaled = scaler.fit_transform(b4_X_test)

# v3 models
lasso_v3_pred = lasso_v3.predict(b3_X_test_scaled)
print("Lasso Regression:", r2_score(b3_Y_test, lasso_v3_pred))
ridge_v3_pred = ridge_v3.predict(b3_X_test_scaled)
print("Ridge Regression:", r2_score(b3_Y_test, ridge_v3_pred))
EN_v3_pred = EN_v3.predict(b3_X_test_scaled)
print("ElasticNet Regression:", r2_score(b3_Y_test, EN_v3_pred))

# v4 ("nodummy") models
lasso_v4_pred = lasso_v4.predict(b4_X_test_scaled)
print("Lasso_nodummy Regression:", r2_score(b4_Y_test, lasso_v4_pred))
ridge_v4_pred = ridge_v4.predict(b4_X_test_scaled)
print("Ridge_nodummy Regression:", r2_score(b4_Y_test, ridge_v4_pred))
# Fix: this line previously predicted with EN_v3 on the v3 test matrix while
# scoring against b4_Y_test, so the reported v4 ElasticNet score was meaningless.
EN_v4_pred = EN_v4.predict(b4_X_test_scaled)
print("ElasticNet_nodummy Regression:", r2_score(b4_Y_test, EN_v4_pred))