![logo](1_bDwEvCRgrKVbLrAXEixpfA.png)
___

##### import libraries

In [81]:
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
import functools as ft

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline 
sns.set(color_codes=True)

from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Step 3(b) - Dimensionality Reduction
    a) Importing data
    b) Variance Threshold
    c) Backwards Elimination
    d) Principal Component Analysis

### a) Importing data
##### Use 'cleaned_data.csv' to begin cleaning

In [86]:
#import .csv file
cleaned2_df = pd.read_csv("cleaned2_data.csv")

cleaned2_df.drop(['Unnamed: 0'], axis=1, inplace=True)

print(cleaned2_df.shape)
cleaned2_df.head(1)

(10062, 22)


Unnamed: 0,id,name,genre,subgenre,category,source_url,blurb,slug,goal,converted_pledged_amount,...,deadline,country,currency,backers_count,disable_communication,is_starrable,spotlight,staff_pick,state,success_percentage
0,656694037,The Treehouse at Cornell,Architecture,design,"{""id"":258,""name"":""Architecture"",""slug"":""design...",https://www.kickstarter.com/discover/categorie...,The Treehouse at Cornell takes architecture to...,the-treehouse-at-cornell,5000,1990,...,2014-04-29 16:32:09,US,USD,35,0,0,0,1,failed,39.8


### b) Variance Threshold

In [87]:
#splitting the data into X and Y / separating the predictor variable 
#leaving only the numeric variables

X = cleaned2_df.filter(['id',
                       'goal',
                       'coverted_pledged_amount',
                       'country',
                       'currency',
                       'backers_count',
                       'disable_communication',
                       'is_starrable',
                       'spotlight',
                       'staff_pick',
                       'success_percentage'], axis=1)
y = cleaned2_df['state']          

In [88]:
print(X.country.unique())
print(X.currency.unique())

#replace US/USD & CA/CAN with 1 & 0 respentively

X.country.replace(['US', 'CA'], [1, 0], inplace=True)
X.currency.replace(['USD', 'CAD'], [1, 0], inplace=True)

print('\n')
print(X.country.unique())
print(X.currency.unique())

['US' 'CA']
['USD' 'CAD']


[1 0]
[1 0]


In [89]:
print(y.unique())

#replace successful/failed with 1 & 0 respentively
y.replace(['successful', 'failed'], [1, 0], inplace=True)

print('\n')
print(y.unique())

['failed' 'successful']


[0 1]


In [90]:
#testing for the threshold of 0.5
thresh = 0.5
selector = VarianceThreshold(threshold=thresh)
vt = selector.fit_transform(X)

In [91]:
#all the columns the Variance Threshold is removing
VarElim = (X.columns.difference(X.loc[:, selector.get_support()].columns)).tolist()
print(VarElim)
print(len(VarElim))

['country', 'currency', 'disable_communication', 'is_starrable', 'spotlight', 'staff_pick']
6


### c) Backward Elimination

In [92]:
#adding constant column of ones, mandatory for sm.OLS model
X_1 = sm.add_constant(X)
print(X_1.dtypes)

const                    float64
id                         int64
goal                       int64
country                    int64
currency                   int64
backers_count              int64
disable_communication      int64
is_starrable               int64
spotlight                  int64
staff_pick                 int64
success_percentage       float64
dtype: object


  return ptp(axis=axis, out=out, **kwargs)


In [93]:
#fitting sm model
model = sm.OLS(y,X_1).fit()
model.pvalues.sort_values()

id                       0.000000
disable_communication    0.000000
is_starrable             0.000000
spotlight                0.000000
country                  0.580668
currency                 0.596364
backers_count            0.721915
const                    0.744655
goal                     0.767166
staff_pick               0.945059
success_percentage       0.965811
dtype: float64

In [94]:
#Backward Elimination
cols = list(X.columns)
pmax = 1
while (len(cols)>0):
    p= []
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(y,X_1).fit()
    p = pd.Series(model.pvalues.values[1:],index = cols)      
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.05):
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols
print(selected_features_BE)
print(len(selected_features_BE))

['disable_communication', 'is_starrable', 'spotlight']
3


  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


### d) Principal Component Analysis

In [102]:
#splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [103]:
#feature scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [104]:
#apply PCA
pca = PCA(n_components = None)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [105]:
exp_variance = pca.explained_variance_ratio_
print(exp_variance)
print (np.count_nonzero(exp_variance))

[3.29439944e-01 2.30647346e-01 1.51070812e-01 1.24964949e-01
 1.06047032e-01 4.82458643e-02 9.58405351e-03 6.27688161e-33
 1.42531557e-34 7.90437220e-40]
10


In [106]:
for i in range(0,np.count_nonzero(exp_variance + 1)):
    print(i," ", sum(exp_variance[0:i]))

0   0
1   0.32943994414858047
2   0.5600872898194567
3   0.7111581016783843
4   0.8361230502913168
5   0.9421700822171512
6   0.990415946489143
7   0.9999999999999999
8   0.9999999999999999
9   0.9999999999999999


### End of Step 3(b)

In [328]:
#write cleaned cleaned_df to a .csv
cleaned2_df.to_csv('cleaned2_data_pca.csv')