In [113]:
#importing library
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import tensorflow as tf

In [114]:
#loading the dataset
df=pd.read_csv('../input/kickstarter-projects/ks-projects-201801.csv')
#showing the dataset
df

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.00
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.00
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.00
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,canceled,1,US,25.0,25.0,50000.00
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,5,US,155.0,155.0,1500.00
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,1,US,20.0,20.0,15000.00
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,6,US,200.0,200.0,15000.00


In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                378661 non-null  int64  
 1   name              378657 non-null  object 
 2   category          378661 non-null  object 
 3   main_category     378661 non-null  object 
 4   currency          378661 non-null  object 
 5   deadline          378661 non-null  object 
 6   goal              378661 non-null  float64
 7   launched          378661 non-null  object 
 8   pledged           378661 non-null  float64
 9   state             378661 non-null  object 
 10  backers           378661 non-null  int64  
 11  country           378661 non-null  object 
 12  usd pledged       374864 non-null  float64
 13  usd_pledged_real  378661 non-null  float64
 14  usd_goal_real     378661 non-null  float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [116]:
#Cleaning the Preprocessing

In [117]:
#dropping the unneeded columns
unneeded_columns=['ID','name']

In [118]:
df=df.drop(unneeded_columns,axis=1)

In [119]:
df.isna().sum()

category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [120]:
#filling missing value with mean of that columns
df['usd pledged']=df['usd pledged'].fillna(df['usd pledged'].mean())

In [121]:
df.isna().sum()

category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64

In [122]:
df['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [123]:
#dropping the anything other than failed and successful 
df=df.drop(df.query('state!="failed" and state!="successful"').index,axis=0).reset_index(drop=True)

# # Feature Engineering and Encoding

In [124]:
df

Unnamed: 0,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.00
2,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.00
3,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.00
4,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
331670,Small Batch,Food,USD,2017-04-19,6500.0,2017-03-20 22:08:22,154.0,failed,4,US,0.0,154.0,6500.00
331671,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,5,US,155.0,155.0,1500.00
331672,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,1,US,20.0,20.0,15000.00
331673,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,6,US,200.0,200.0,15000.00


In [125]:
df['deadline_year']=df['deadline'].apply(lambda x:np.float(x[0:4]))
df['deadline_month']=df['deadline'].apply(lambda x:np.float(x[5:7]))
df['launched_year']=df['launched'].apply(lambda x:np.float(x[0:4]))
df['launched_month']=df['launched'].apply(lambda x:np.float(x[5:7]))

df=df.drop(['deadline','launched'],axis=1)






Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


In [126]:
df

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,deadline_year,deadline_month,launched_year,launched_month
0,Poetry,Publishing,GBP,1000.0,0.0,failed,0,GB,0.0,0.0,1533.95,2015.0,10.0,2015.0,8.0
1,Narrative Film,Film & Video,USD,30000.0,2421.0,failed,15,US,100.0,2421.0,30000.00,2017.0,11.0,2017.0,9.0
2,Narrative Film,Film & Video,USD,45000.0,220.0,failed,3,US,220.0,220.0,45000.00,2013.0,2.0,2013.0,1.0
3,Music,Music,USD,5000.0,1.0,failed,1,US,1.0,1.0,5000.00,2012.0,4.0,2012.0,3.0
4,Restaurants,Food,USD,50000.0,52375.0,successful,224,US,52375.0,52375.0,50000.00,2016.0,4.0,2016.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331670,Small Batch,Food,USD,6500.0,154.0,failed,4,US,0.0,154.0,6500.00,2017.0,4.0,2017.0,3.0
331671,Narrative Film,Film & Video,USD,1500.0,155.0,failed,5,US,155.0,155.0,1500.00,2011.0,7.0,2011.0,6.0
331672,Narrative Film,Film & Video,USD,15000.0,20.0,failed,1,US,20.0,20.0,15000.00,2010.0,8.0,2010.0,7.0
331673,Technology,Technology,USD,15000.0,200.0,failed,6,US,200.0,200.0,15000.00,2016.0,2.0,2016.0,1.0


In [127]:
df['state']=df['state'].apply(lambda x:1 if x=='successful' else 0)

In [128]:
df['state']

0         0
1         0
2         0
3         0
4         1
         ..
331670    0
331671    0
331672    0
331673    0
331674    0
Name: state, Length: 331675, dtype: int64

In [129]:
df

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,deadline_year,deadline_month,launched_year,launched_month
0,Poetry,Publishing,GBP,1000.0,0.0,0,0,GB,0.0,0.0,1533.95,2015.0,10.0,2015.0,8.0
1,Narrative Film,Film & Video,USD,30000.0,2421.0,0,15,US,100.0,2421.0,30000.00,2017.0,11.0,2017.0,9.0
2,Narrative Film,Film & Video,USD,45000.0,220.0,0,3,US,220.0,220.0,45000.00,2013.0,2.0,2013.0,1.0
3,Music,Music,USD,5000.0,1.0,0,1,US,1.0,1.0,5000.00,2012.0,4.0,2012.0,3.0
4,Restaurants,Food,USD,50000.0,52375.0,1,224,US,52375.0,52375.0,50000.00,2016.0,4.0,2016.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331670,Small Batch,Food,USD,6500.0,154.0,0,4,US,0.0,154.0,6500.00,2017.0,4.0,2017.0,3.0
331671,Narrative Film,Film & Video,USD,1500.0,155.0,0,5,US,155.0,155.0,1500.00,2011.0,7.0,2011.0,6.0
331672,Narrative Film,Film & Video,USD,15000.0,20.0,0,1,US,20.0,20.0,15000.00,2010.0,8.0,2010.0,7.0
331673,Technology,Technology,USD,15000.0,200.0,0,6,US,200.0,200.0,15000.00,2016.0,2.0,2016.0,1.0


In [130]:
{column:list(df[column].unique()) for column in df.columns if df.dtypes[column]=='object'}

{'category': ['Poetry',
  'Narrative Film',
  'Music',
  'Restaurants',
  'Food',
  'Drinks',
  'Nonfiction',
  'Indie Rock',
  'Crafts',
  'Games',
  'Tabletop Games',
  'Design',
  'Comic Books',
  'Art Books',
  'Fashion',
  'Childrenswear',
  'Theater',
  'Comics',
  'DIY',
  'Webseries',
  'Animation',
  'Food Trucks',
  'Product Design',
  'Public Art',
  'Documentary',
  'Illustration',
  'Photography',
  'Pop',
  'People',
  'Art',
  'Family',
  'Fiction',
  'Film & Video',
  'Accessories',
  'Rock',
  'Hardware',
  'Software',
  'Weaving',
  'Web',
  'Jazz',
  'Ready-to-wear',
  'Festivals',
  'Video Games',
  'Anthologies',
  'Publishing',
  'Shorts',
  'Gadgets',
  'Electronic Music',
  'Radio & Podcasts',
  'Cookbooks',
  'Apparel',
  'Metal',
  'Comedy',
  'Hip-Hop',
  'Periodicals',
  'Dance',
  'Technology',
  'Painting',
  'World Music',
  'Photobooks',
  'Drama',
  'Architecture',
  'Young Adult',
  'Latin',
  'Mobile Games',
  'Flight',
  'Fine Art',
  'Action',
  'Pl

In [131]:
def onehot_encode(df,columns,prefixes):
    df=df.copy()
    for column,prefix in zip(columns,prefixes):
        dummies=pd.get_dummies(df[column],prefix=prefix)
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df

In [132]:
df=onehot_encode(
    df,
    ['category','main_category','currency','country'],
    ['cat','main_cat','curr','country'])

In [133]:
df

Unnamed: 0,goal,pledged,state,backers,usd pledged,usd_pledged_real,usd_goal_real,deadline_year,deadline_month,launched_year,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,1000.0,0.0,0,0,0.0,0.0,1533.95,2015.0,10.0,2015.0,...,0,0,0,0,0,0,0,0,0,0
1,30000.0,2421.0,0,15,100.0,2421.0,30000.00,2017.0,11.0,2017.0,...,0,0,0,0,0,0,0,0,0,1
2,45000.0,220.0,0,3,220.0,220.0,45000.00,2013.0,2.0,2013.0,...,0,0,0,0,0,0,0,0,0,1
3,5000.0,1.0,0,1,1.0,1.0,5000.00,2012.0,4.0,2012.0,...,0,0,0,0,0,0,0,0,0,1
4,50000.0,52375.0,1,224,52375.0,52375.0,50000.00,2016.0,4.0,2016.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331670,6500.0,154.0,0,4,0.0,154.0,6500.00,2017.0,4.0,2017.0,...,0,0,0,0,0,0,0,0,0,1
331671,1500.0,155.0,0,5,155.0,155.0,1500.00,2011.0,7.0,2011.0,...,0,0,0,0,0,0,0,0,0,1
331672,15000.0,20.0,0,1,20.0,20.0,15000.00,2010.0,8.0,2010.0,...,0,0,0,0,0,0,0,0,0,1
331673,15000.0,200.0,0,6,200.0,200.0,15000.00,2016.0,2.0,2016.0,...,0,0,0,0,0,0,0,0,0,1


# #Splitting and Scaling

In [134]:
y=df.loc[:,'state']
x=df.drop('state',axis=1)

In [135]:
scaler=StandardScaler()
x=scaler.fit_transform(x)

In [136]:
#spliting and scaling the dataset
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)

# Modeling and Training

In [137]:
x.shape

(331675, 221)

In [138]:
y.mean()

0.4038772895153388

In [139]:
inputs=tf.keras.Input(shape=(221,))
x=tf.keras.layers.Dense(64,activation='relu')(inputs)
x=tf.keras.layers.Dense(64,activation='relu')(x)
outputs=tf.keras.layers.Dense(1,activation='sigmoid')(x)
model=tf.keras.Model(inputs,outputs)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',tf.keras.metrics.AUC(name='auc')
    ])
batch_size=64,
epochs=100


In [144]:
history=model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1
        )
    ]
)

TypeError: unsupported operand type(s) for /: 'int' and 'tuple'