In [8]:
#importing library
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import tensorflow as tf

In [9]:
#loading the data
df=pd.read_csv('../input/kickstarter-projects/ks-projects-201801.csv')
df

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.00
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.00
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.00
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,canceled,1,US,25.0,25.0,50000.00
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,5,US,155.0,155.0,1500.00
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,1,US,20.0,20.0,15000.00
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,6,US,200.0,200.0,15000.00


In [10]:
#getting information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                378661 non-null  int64  
 1   name              378657 non-null  object 
 2   category          378661 non-null  object 
 3   main_category     378661 non-null  object 
 4   currency          378661 non-null  object 
 5   deadline          378661 non-null  object 
 6   goal              378661 non-null  float64
 7   launched          378661 non-null  object 
 8   pledged           378661 non-null  float64
 9   state             378661 non-null  object 
 10  backers           378661 non-null  int64  
 11  country           378661 non-null  object 
 12  usd pledged       374864 non-null  float64
 13  usd_pledged_real  378661 non-null  float64
 14  usd_goal_real     378661 non-null  float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [11]:
unneeded_columns=['ID','name']
df=df.drop(unneeded_columns,axis=1)

In [12]:
#checking for missing values
df.isna().sum()

category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [13]:
#replacing missing values with mean value
df['usd pledged']=df['usd pledged'].fillna(df['usd pledged'].mean())


In [14]:
#again checking for misisng values
df.isna().sum()

category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64

In [15]:
#checking for unique values
df['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [16]:
#droping the column other than failed and successful
df=df.drop(df.query("state!='failed' and state!='successful'").index,axis=0).reset_index(drop=True)

In [17]:
df

Unnamed: 0,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.00
2,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.00
3,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.00
4,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
331670,Small Batch,Food,USD,2017-04-19,6500.0,2017-03-20 22:08:22,154.0,failed,4,US,0.0,154.0,6500.00
331671,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,5,US,155.0,155.0,1500.00
331672,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,1,US,20.0,20.0,15000.00
331673,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,6,US,200.0,200.0,15000.00


In [18]:
#again checking for unique value
df['state'].unique()

array(['failed', 'successful'], dtype=object)

# Feature Engineering and Encoding

In [19]:
#extacting only year and month from  the deadline column
df['deadline_year']=df['deadline'].apply(lambda x: np.float(x[0:4]))
df['deadline_month']=df['deadline'].apply(lambda x: np.float(x[5:7]))
df['launched_year']=df['launched'].apply(lambda x: np.float(x[0:4]))
df['launched_month']=df['launched'].apply(lambda x: np.float(x[5:7]))

In [20]:
#droping the deadline and launched
df=df.drop(['deadline','launched'],axis=1)

In [21]:
df

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,deadline_year,deadline_month,launched_year,launched_month
0,Poetry,Publishing,GBP,1000.0,0.0,failed,0,GB,0.0,0.0,1533.95,2015.0,10.0,2015.0,8.0
1,Narrative Film,Film & Video,USD,30000.0,2421.0,failed,15,US,100.0,2421.0,30000.00,2017.0,11.0,2017.0,9.0
2,Narrative Film,Film & Video,USD,45000.0,220.0,failed,3,US,220.0,220.0,45000.00,2013.0,2.0,2013.0,1.0
3,Music,Music,USD,5000.0,1.0,failed,1,US,1.0,1.0,5000.00,2012.0,4.0,2012.0,3.0
4,Restaurants,Food,USD,50000.0,52375.0,successful,224,US,52375.0,52375.0,50000.00,2016.0,4.0,2016.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331670,Small Batch,Food,USD,6500.0,154.0,failed,4,US,0.0,154.0,6500.00,2017.0,4.0,2017.0,3.0
331671,Narrative Film,Film & Video,USD,1500.0,155.0,failed,5,US,155.0,155.0,1500.00,2011.0,7.0,2011.0,6.0
331672,Narrative Film,Film & Video,USD,15000.0,20.0,failed,1,US,20.0,20.0,15000.00,2010.0,8.0,2010.0,7.0
331673,Technology,Technology,USD,15000.0,200.0,failed,6,US,200.0,200.0,15000.00,2016.0,2.0,2016.0,1.0


In [22]:
#changing state column the numerical value
df['state']=df['state'].apply(lambda x: 1 if x=='successful' else 0)
df

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,deadline_year,deadline_month,launched_year,launched_month
0,Poetry,Publishing,GBP,1000.0,0.0,0,0,GB,0.0,0.0,1533.95,2015.0,10.0,2015.0,8.0
1,Narrative Film,Film & Video,USD,30000.0,2421.0,0,15,US,100.0,2421.0,30000.00,2017.0,11.0,2017.0,9.0
2,Narrative Film,Film & Video,USD,45000.0,220.0,0,3,US,220.0,220.0,45000.00,2013.0,2.0,2013.0,1.0
3,Music,Music,USD,5000.0,1.0,0,1,US,1.0,1.0,5000.00,2012.0,4.0,2012.0,3.0
4,Restaurants,Food,USD,50000.0,52375.0,1,224,US,52375.0,52375.0,50000.00,2016.0,4.0,2016.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331670,Small Batch,Food,USD,6500.0,154.0,0,4,US,0.0,154.0,6500.00,2017.0,4.0,2017.0,3.0
331671,Narrative Film,Film & Video,USD,1500.0,155.0,0,5,US,155.0,155.0,1500.00,2011.0,7.0,2011.0,6.0
331672,Narrative Film,Film & Video,USD,15000.0,20.0,0,1,US,20.0,20.0,15000.00,2010.0,8.0,2010.0,7.0
331673,Technology,Technology,USD,15000.0,200.0,0,6,US,200.0,200.0,15000.00,2016.0,2.0,2016.0,1.0


In [23]:
{column: list(df[column].unique()) for column in df.columns if df.dtypes[column]=='object'}

{'category': ['Poetry',
  'Narrative Film',
  'Music',
  'Restaurants',
  'Food',
  'Drinks',
  'Nonfiction',
  'Indie Rock',
  'Crafts',
  'Games',
  'Tabletop Games',
  'Design',
  'Comic Books',
  'Art Books',
  'Fashion',
  'Childrenswear',
  'Theater',
  'Comics',
  'DIY',
  'Webseries',
  'Animation',
  'Food Trucks',
  'Product Design',
  'Public Art',
  'Documentary',
  'Illustration',
  'Photography',
  'Pop',
  'People',
  'Art',
  'Family',
  'Fiction',
  'Film & Video',
  'Accessories',
  'Rock',
  'Hardware',
  'Software',
  'Weaving',
  'Web',
  'Jazz',
  'Ready-to-wear',
  'Festivals',
  'Video Games',
  'Anthologies',
  'Publishing',
  'Shorts',
  'Gadgets',
  'Electronic Music',
  'Radio & Podcasts',
  'Cookbooks',
  'Apparel',
  'Metal',
  'Comedy',
  'Hip-Hop',
  'Periodicals',
  'Dance',
  'Technology',
  'Painting',
  'World Music',
  'Photobooks',
  'Drama',
  'Architecture',
  'Young Adult',
  'Latin',
  'Mobile Games',
  'Flight',
  'Fine Art',
  'Action',
  'Pl

In [24]:
#creating function for onehot encode
def onehot_encode(df,columns,prefixes):
    df=df.copy()
    for column,prefix in zip(columns,prefixes):
        dummies=pd.get_dummies(df[column],prefix=prefix)
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df


In [25]:
#implementing the function
df=onehot_encode(
df,['category','main_category','currency','country'],
    ['cat','main_cat''curr','country']
)

In [26]:
df=df.drop('country',axis=1)
df

Unnamed: 0,goal,pledged,state,backers,usd pledged,usd_pledged_real,usd_goal_real,deadline_year,deadline_month,launched_year,...,country_EUR,country_GBP,country_HKD,country_JPY,country_MXN,country_NOK,country_NZD,country_SEK,country_SGD,country_USD
0,1000.0,0.0,0,0,0.0,0.0,1533.95,2015.0,10.0,2015.0,...,0,1,0,0,0,0,0,0,0,0
1,30000.0,2421.0,0,15,100.0,2421.0,30000.00,2017.0,11.0,2017.0,...,0,0,0,0,0,0,0,0,0,1
2,45000.0,220.0,0,3,220.0,220.0,45000.00,2013.0,2.0,2013.0,...,0,0,0,0,0,0,0,0,0,1
3,5000.0,1.0,0,1,1.0,1.0,5000.00,2012.0,4.0,2012.0,...,0,0,0,0,0,0,0,0,0,1
4,50000.0,52375.0,1,224,52375.0,52375.0,50000.00,2016.0,4.0,2016.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331670,6500.0,154.0,0,4,0.0,154.0,6500.00,2017.0,4.0,2017.0,...,0,0,0,0,0,0,0,0,0,1
331671,1500.0,155.0,0,5,155.0,155.0,1500.00,2011.0,7.0,2011.0,...,0,0,0,0,0,0,0,0,0,1
331672,15000.0,20.0,0,1,20.0,20.0,15000.00,2010.0,8.0,2010.0,...,0,0,0,0,0,0,0,0,0,1
331673,15000.0,200.0,0,6,200.0,200.0,15000.00,2016.0,2.0,2016.0,...,0,0,0,0,0,0,0,0,0,1


# Splitting and Scaling the data

In [27]:
y=df.loc[:,'state']
x=df.drop('state',axis=1)

In [28]:
#scaling our  data
scaler=StandardScaler()
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)


In [29]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)

In [51]:
x_train.shape

(232172, 198)

In [31]:
y.mean()

0.4038772895153388

In [32]:
class_weights=class_weight.compute_class_weight(
    'balanced',y_train.unique(),
    y_train
)

class_weights=dict(enumerate(class_weights))
class_weights





132957    0
269914    0
196964    0
114257    0
         ..
221328    0
331067    1
91241     1
209079    0
14890     0
Name: state, Length: 232172, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


{0: 1.2353648543668656, 1: 0.8399672944870951}

In [52]:
inputs=tf.keras.Input(shape=(198,))
x=tf.keras.layers.Dense(64,activation='relu')(inputs)
x=tf.keras.layers.Dense(64,activation='relu')(x)
outputs=tf.keras.layers.Dense(1,activation='sigmoid')(x)

model=tf.keras.Model(inputs,outputs)
model.compile(optimizer='adam',
             loss='binary_crossentropy',
              metrics=['accuracy',
                      tf.keras.metrics.AUC(name='auc')]
             )
batch_size=64
epochs=100
history=model.fit(x_train,y_train,validation_split=0.2,
                  class_weight=class_weights,
                  batch_size=batch_size,
                  epochs=epochs,
                  callbacks=[
                      tf.keras.callbacks.EarlyStopping(
                      monitor='val_loss',
                      patience=3,
                      restore_best_weights=True,
                      verbose=1)
                  ]
                 )


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping


In [53]:
model.evaluate(x_test,y_test)

Exception ignored in: <function IteratorResourceDeleter.__del__ at 0x7fa73e5ab4d0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 546, in __del__
    handle=self._handle, deleter=self._deleter)
  File "/opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1264, in delete_iterator
    _ctx, "DeleteIterator", name, handle, deleter)
KeyboardInterrupt: 




[0.17186249792575836, 0.9291679859161377, 0.9814707636833191]