In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

### Build dataset and trim features

In [2]:
# all of our usable dataframes
df = pd.read_csv('data/master_df.csv')
nlp_data = df[['blurb', 'slug', 'country']]  #also includes slug from category and some info from location

In [3]:
# read category data for name and parent name
df_category = pd.read_csv('data/category.csv')
df_category = df_category[['name', 'parent_name']]

In [12]:
# drop columns deemed unessecary
model_data = df.drop(columns = ['current_currency','static_usd_rate','usd_exchange_rate','usd_type','id','name', 'slug', 'category', 'creator', 'location', 'photo', 'profile', 'urls', 'country_displayable_name', 'currency_symbol', 'currency_trailing_code', 'disable_communication', 'source_url', 'currency', 'pledged', 'blurb'])

# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data.drop(columns=['fx_rate', 'goal'])

# combine category data and all other date
model_data = pd.concat([model_data, df_category], axis=1, join='inner')

# create percentage funded column as another metric
model_data['percentage_funded'] = model_data.converted_pledged_amount/model_data.goal_usd*100

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.created_at)*0.00001157

model_data

Unnamed: 0,backers_count,converted_pledged_amount,country,created_at,deadline,fx_rate,goal,is_starrable,launched_at,spotlight,staff_pick,state,state_changed_at,usd_pledged,goal_usd,name,parent_name,percentage_funded,total_days_active
0,18,5034,US,1609376406,1611968831,1.000000,5000.0,False,1609545583,True,False,successful,1611968831,5034.000000,5000.000000,Cookbooks,Food,100.680000,29.994357
1,508,48365,HK,1606278560,1614096106,0.127500,350000.0,False,1608912106,True,True,successful,1614096106,48368.291331,44624.926500,Cookbooks,Food,108.381131,90.449007
2,14,98,ES,1606477096,1607900340,1.081901,50.0,False,1606751262,True,False,successful,1607900340,96.906412,54.095050,Cookbooks,Food,181.162603,16.466933
3,1486,127765,US,1604500905,1610082068,1.000000,65000.0,False,1606194068,True,True,successful,1610082068,127765.690000,65000.000000,Cookbooks,Food,196.561538,64.574056
4,249,14574,GB,1605454727,1608573895,1.304140,10600.0,False,1605981895,True,True,successful,1608573895,14480.304767,13823.881138,Cookbooks,Food,105.426254,36.088774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30987,4,501,US,1632678330,1634318870,1.000000,10000.0,False,1633022870,False,False,failed,1634318870,501.000000,10000.000000,Software,Technology,5.010000,18.981048
30988,1,1,CA,1632726042,1638144718,0.794451,50000.0,False,1632957118,False,False,failed,1638144719,1.022898,39722.546000,Software,Technology,0.002517,62.694081
30989,1,0,DK,1631517750,1635408514,0.145430,1000000.0,False,1632816514,False,False,failed,1635408514,0.972658,145430.220000,Software,Technology,0.000000,45.016139
30990,3,70,US,1429554526,1432913659,1.000000,35000.0,False,1430321659,False,False,failed,1432913660,70.000000,35000.000000,Plays,Theater,0.200000,38.865169


In [13]:
model_data.isna().sum().sum()

1374

In [14]:
model_data = model_data.dropna()
model_data.shape

(29643, 19)

In [22]:
# split into X and y variables
X = model_data.drop(columns=['state'])
y = model_data.state

### Encoding of non neumerical features