In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler

### Build dataset and trim features

In [3]:
# all of our usable dataframes
df = pd.read_csv('data/master_df.csv')
nlp_data = df[['blurb', 'slug', 'country']]  #also includes slug from category and some info from location

In [4]:
# read category data for name and parent name
df_category = pd.read_csv('data/category.csv')
df_category = df_category[['name', 'parent_name']]

In [5]:
# drop columns deemed unessecary
model_data = df.drop(columns = ['current_currency','static_usd_rate','usd_exchange_rate','usd_type','id','name', 'slug', 'category', 'creator', 'location', 'photo', 'profile', 'urls', 'country_displayable_name', 'currency_symbol', 'currency_trailing_code', 'disable_communication', 'source_url', 'currency', 'pledged', 'blurb'])

# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data.drop(columns=['fx_rate', 'goal'])

# combine category data and all other date
model_data = pd.concat([model_data, df_category], axis=1, join='inner')

# create percentage funded column as another metric
model_data['percentage_funded'] = model_data.converted_pledged_amount/model_data.goal_usd*100

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.created_at)*0.00001157

model_data

Unnamed: 0,backers_count,converted_pledged_amount,country,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,name,parent_name,percentage_funded,total_days_active
0,18,5034,US,1609376406,1611968831,1.000000,5000.0,False,1609545583,True,False,successful,1611968831,5034.000000,5000.000000,Cookbooks,Food,100.680000,29.994357
1,508,48365,HK,1606278560,1614096106,0.127500,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.926500,Cookbooks,Food,108.381131,90.449007
2,14,98,ES,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.095050,Cookbooks,Food,181.162603,16.466933
3,1486,127765,US,1604500905,1610082068,1.000000,65000.0,False,1606194068,True,True,successful,1610082068,127765.690000,65000.000000,Cookbooks,Food,196.561538,64.574056
4,249,14574,GB,1605454727,1608573895,1.304140,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,Cookbooks,Food,105.426254,36.088774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30987,32,3395,US,1434905504,1443967460,1.000000,3000.0,False,1441375460,True,False,successful,1443967460,3395.000000,3000.000000,Fiction,Publishing,113.166667,104.846831
30988,284,21112,US,1440643700,1443937679,1.000000,12000.0,False,1441345679,True,False,successful,1443937681,21112.150000,12000.000000,Fiction,Publishing,175.933333,38.111337
30989,15,2000,US,1441111356,1446319364,1.000000,2000.0,False,1441135364,True,False,successful,1446319364,2000.000000,2000.000000,Fiction,Publishing,100.000000,60.256653
30990,22,3125,US,1436479764,1443703429,1.000000,3000.0,False,1441111429,True,False,successful,1443703429,3125.000000,3000.000000,Fiction,Publishing,104.166667,83.577804


In [6]:
model_data.isna().sum().sum()

1374

In [7]:
model_data = model_data.dropna()
model_data.shape

(29643, 19)

In [8]:
# Creating an instance of label encoder
label_encoder = LabelEncoder()


In [9]:
# Fitting the label encoder
label_encoder.fit(model_data['country'])

LabelEncoder()

In [10]:
# List the classes identified by the label encoder
list(label_encoder.classes_)

['AT',
 'AU',
 'BE',
 'CA',
 'CH',
 'DE',
 'DK',
 'ES',
 'FR',
 'GB',
 'GR',
 'HK',
 'IE',
 'IT',
 'JP',
 'LU',
 'MX',
 'NL',
 'NO',
 'NZ',
 'PL',
 'SE',
 'SG',
 'SI',
 'US']

In [11]:
# Encode the country as an integer
model_data['country_le'] = label_encoder.transform(model_data['country'])
model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,backers_count,converted_pledged_amount,country,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,name,parent_name,percentage_funded,total_days_active,country_le
0,18,5034,US,1609376406,1611968831,1.0,5000.0,False,1609545583,True,False,successful,1611968831,5034.0,5000.0,Cookbooks,Food,100.68,29.994357,24
1,508,48365,HK,1606278560,1614096106,0.1275,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.9265,Cookbooks,Food,108.381131,90.449007,11
2,14,98,ES,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.09505,Cookbooks,Food,181.162603,16.466933,7
3,1486,127765,US,1604500905,1610082068,1.0,65000.0,False,1606194068,True,True,successful,1610082068,127765.69,65000.0,Cookbooks,Food,196.561538,64.574056,24
4,249,14574,GB,1605454727,1608573895,1.30414,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,Cookbooks,Food,105.426254,36.088774,9


In [12]:
# Dropping country column
model_data.drop(['country'], axis=1, inplace=True)
model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,name,parent_name,percentage_funded,total_days_active,country_le
0,18,5034,1609376406,1611968831,1.0,5000.0,False,1609545583,True,False,successful,1611968831,5034.0,5000.0,Cookbooks,Food,100.68,29.994357,24
1,508,48365,1606278560,1614096106,0.1275,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.9265,Cookbooks,Food,108.381131,90.449007,11
2,14,98,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.09505,Cookbooks,Food,181.162603,16.466933,7
3,1486,127765,1604500905,1610082068,1.0,65000.0,False,1606194068,True,True,successful,1610082068,127765.69,65000.0,Cookbooks,Food,196.561538,64.574056,24
4,249,14574,1605454727,1608573895,1.30414,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,Cookbooks,Food,105.426254,36.088774,9


In [13]:
# Fitting the label encoder
label_encoder.fit(model_data['name'])

LabelEncoder()

In [14]:
# Encode the name as an integer
model_data['name_le'] = label_encoder.transform(model_data['name'])
model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,name,parent_name,percentage_funded,total_days_active,country_le,name_le
0,18,5034,1609376406,1611968831,1.0,5000.0,False,1609545583,True,False,successful,1611968831,5034.0,5000.0,Cookbooks,Food,100.68,29.994357,24,7
1,508,48365,1606278560,1614096106,0.1275,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.9265,Cookbooks,Food,108.381131,90.449007,11,7
2,14,98,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.09505,Cookbooks,Food,181.162603,16.466933,7,7
3,1486,127765,1604500905,1610082068,1.0,65000.0,False,1606194068,True,True,successful,1610082068,127765.69,65000.0,Cookbooks,Food,196.561538,64.574056,24,7
4,249,14574,1605454727,1608573895,1.30414,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,Cookbooks,Food,105.426254,36.088774,9,7


In [15]:
# Dropping name column
model_data.drop(['name'], axis=1, inplace=True)
model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,parent_name,percentage_funded,total_days_active,country_le,name_le
0,18,5034,1609376406,1611968831,1.0,5000.0,False,1609545583,True,False,successful,1611968831,5034.0,5000.0,Food,100.68,29.994357,24,7
1,508,48365,1606278560,1614096106,0.1275,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.9265,Food,108.381131,90.449007,11,7
2,14,98,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.09505,Food,181.162603,16.466933,7,7
3,1486,127765,1604500905,1610082068,1.0,65000.0,False,1606194068,True,True,successful,1610082068,127765.69,65000.0,Food,196.561538,64.574056,24,7
4,249,14574,1605454727,1608573895,1.30414,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,Food,105.426254,36.088774,9,7


In [16]:
# Fitting the label encoder
label_encoder.fit(model_data['parent_name'])

LabelEncoder()

In [17]:
# Encode the parent name as an integer
model_data['parent_name_le'] = label_encoder.transform(model_data['parent_name'])
model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,parent_name,percentage_funded,total_days_active,country_le,name_le,parent_name_le
0,18,5034,1609376406,1611968831,1.0,5000.0,False,1609545583,True,False,successful,1611968831,5034.0,5000.0,Food,100.68,29.994357,24,7,4
1,508,48365,1606278560,1614096106,0.1275,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.9265,Food,108.381131,90.449007,11,7,4
2,14,98,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.09505,Food,181.162603,16.466933,7,7,4
3,1486,127765,1604500905,1610082068,1.0,65000.0,False,1606194068,True,True,successful,1610082068,127765.69,65000.0,Food,196.561538,64.574056,24,7,4
4,249,14574,1605454727,1608573895,1.30414,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,Food,105.426254,36.088774,9,7,4


In [18]:
# Dropping parent_name column
model_data.drop(['parent_name'], axis=1, inplace=True)
model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,percentage_funded,total_days_active,country_le,name_le,parent_name_le
0,18,5034,1609376406,1611968831,1.0,5000.0,False,1609545583,True,False,successful,1611968831,5034.0,5000.0,100.68,29.994357,24,7,4
1,508,48365,1606278560,1614096106,0.1275,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.9265,108.381131,90.449007,11,7,4
2,14,98,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.09505,181.162603,16.466933,7,7,4
3,1486,127765,1604500905,1610082068,1.0,65000.0,False,1606194068,True,True,successful,1610082068,127765.69,65000.0,196.561538,64.574056,24,7,4
4,249,14574,1605454727,1608573895,1.30414,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,105.426254,36.088774,9,7,4


In [19]:
# Fitting the label encoder
label_encoder.fit(model_data['state'])

LabelEncoder()

In [20]:
# List the classes identified by the label encoder
list(label_encoder.classes_)

['canceled', 'failed', 'live', 'successful']

In [21]:
# Encode the state as an integer
model_data['state_le'] = label_encoder.transform(model_data['state'])
model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,percentage_funded,total_days_active,country_le,name_le,parent_name_le,state_le
0,18,5034,1609376406,1611968831,1.0,5000.0,False,1609545583,True,False,successful,1611968831,5034.0,5000.0,100.68,29.994357,24,7,4,3
1,508,48365,1606278560,1614096106,0.1275,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.9265,108.381131,90.449007,11,7,4,3
2,14,98,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.09505,181.162603,16.466933,7,7,4,3
3,1486,127765,1604500905,1610082068,1.0,65000.0,False,1606194068,True,True,successful,1610082068,127765.69,65000.0,196.561538,64.574056,24,7,4,3
4,249,14574,1605454727,1608573895,1.30414,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,105.426254,36.088774,9,7,4,3


In [22]:
# Dropping state column
model_data.drop(['state'], axis=1, inplace=True)
model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state_changed_at,usd_pledged,goal_usd,percentage_funded,total_days_active,country_le,name_le,parent_name_le,state_le
0,18,5034,1609376406,1611968831,1.0,5000.0,False,1609545583,True,False,1611968831,5034.0,5000.0,100.68,29.994357,24,7,4,3
1,508,48365,1606278560,1614096106,0.1275,350000.0,False,1608912106,True,True,1614096106,48368.291331,44624.9265,108.381131,90.449007,11,7,4,3
2,14,98,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,1607900340,96.906412,54.09505,181.162603,16.466933,7,7,4,3
3,1486,127765,1604500905,1610082068,1.0,65000.0,False,1606194068,True,True,1610082068,127765.69,65000.0,196.561538,64.574056,24,7,4,3
4,249,14574,1605454727,1608573895,1.30414,10600.0,False,1605981895,True,True,1608573895,14480.304767,13823.881138,105.426254,36.088774,9,7,4,3


In [23]:
# Creating the scaler instance
data_scaler = StandardScaler()

In [24]:
# Fitting the scaler
data_scaler.fit(model_data)

StandardScaler()

In [25]:
# Transforming the data
model_data_scaled = data_scaler.transform(model_data)
model_data_scaled[:5]

array([[-1.07940827e-01, -3.67311533e-02,  1.14682252e+00,
         1.08891687e+00,  9.68707715e-02, -3.88909303e-02,
        -1.61993809e-01,  1.09502966e+00,  7.74812699e-01,
        -4.51903496e-01,  1.09209468e+00, -3.67898415e-02,
        -3.27280322e-02, -1.56694793e-02, -3.64576607e-01,
         6.35900967e-01, -1.19958738e+00, -3.50153075e-01,
         7.52342802e-01],
       [ 2.86310012e-01,  1.32554412e-01,  1.11144061e+00,
         1.11318792e+00, -3.54558593e+00,  3.26952412e-01,
        -1.61993809e-01,  1.08779775e+00,  7.74812699e-01,
         2.21286184e+00,  1.11639792e+00,  1.32500188e-01,
         2.80137931e-02, -1.54877577e-02,  1.87440428e-02,
        -1.04397455e+00, -1.19958738e+00, -3.50153075e-01,
         7.52342802e-01],
       [-1.11159201e-01, -5.60151198e-02,  1.11370818e+00,
         1.04249760e+00,  4.38785657e-01, -4.41399869e-02,
        -1.61993809e-01,  1.06312909e+00,  7.74812699e-01,
        -4.51903496e-01,  1.04561383e+00, -5.60771240e-02,
    

In [26]:
# split into X and y variables
X = model_data.drop(columns=['state_le'])
y = model_data.state_le

In [27]:
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, test_size = 0.3,
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(20750, 18)

In [28]:
# Instantiate a linear SVM model
from sklearn.svm import SVC
classifier = SVC(kernel='linear',max_iter=100000)
classifier

SVC(kernel='linear', max_iter=100000)

In [29]:
# Fit the data
classifier.fit(X_train, y_train)



SVC(kernel='linear', max_iter=100000)

In [30]:
# Score the accuracy
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6973493975903614
Testing Data Score: 0.706623186776116


In [31]:
# Make predictions using the test data
predictions = classifier.predict(X_test)
results = pd.DataFrame({
    "Prediction": predictions, 
    "Actual": y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,3,3
1,3,1
2,3,3
3,0,0
4,1,1


In [33]:
X_test

Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state_changed_at,usd_pledged,goal_usd,percentage_funded,total_days_active,country_le,name_le,parent_name_le
21992,479,4848,1612503993,1617760800,1.000000,100.0,False,1615924712,True,False,1617760800,4848.000000,100.00000,4848.000000,60.821257,24,12,7
3545,2,2,1564355197,1568736000,1.000000,4350.0,False,1565611418,False,False,1568736000,2.000000,4350.00000,0.045977,50.685891,24,31,8
4263,620,33100,1556674617,1561687200,1.000000,20000.0,False,1559065814,True,True,1561687200,33100.230000,20000.00000,165.500000,57.995585,24,15,1
20707,3,80,1444929001,1447531968,1.000000,4000.0,False,1444936368,False,False,1445023866,80.000000,4000.00000,2.000000,30.116328,24,26,6
21620,2,501,1640647899,1644518846,1.000000,160000.0,False,1641926846,False,False,1644518846,501.000000,160000.00000,0.313125,44.786857,24,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28924,86,6322,1608878595,1614718017,1.000000,5000.0,False,1609534017,True,False,1614718017,6322.000000,5000.00000,126.440000,67.562113,24,9,4
15946,1,5,1432998496,1435591200,1.000000,30000.0,False,1433012836,False,False,1435591205,5.000000,30000.00000,0.016667,29.997585,24,10,4
7443,2,33,1492466651,1495061406,0.794451,1000.0,False,1492469406,False,False,1495061408,34.515178,794.45092,4.153812,30.021315,3,27,6
1070,1254,317760,1474558281,1484223520,1.081901,20000.0,False,1481631520,True,True,1484223520,316637.972354,21638.01980,1468.526247,111.826815,5,32,8


In [70]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
cm, index=['Actual 0', 'Actual 1', 'Actual 2', 'Actual 3'], columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Predicted 3']
)
# Displaying results
print("Confusion Matrix")
display(cm_df)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3
Actual 0,240,10,1,35
Actual 1,0,1078,0,1743
Actual 2,0,0,229,0
Actual 3,0,820,0,4737


In [71]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.84      0.91       286
           1       0.56      0.38      0.46      2821
           2       1.00      1.00      1.00       229
           3       0.73      0.85      0.78      5557

    accuracy                           0.71      8893
   macro avg       0.82      0.77      0.79      8893
weighted avg       0.69      0.71      0.69      8893

