In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
import random
import numpy as np
import statistics

In [2]:
data = pd.read_csv("../Data/Cleaned/from_kickstarter_reduced.csv")

In [3]:
data.describe()

Unnamed: 0.1,Unnamed: 0,id,backers_count,goal,usd_pledged,static_usd_rate,year,month,day,hour,days_to_deadline,goal_USD,blurb_length,binary_state
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,2041.47575,1075254000.0,152.2323,35160.41,12678.63,1.007821,2015.5654,6.21475,15.1083,0.0,34.26565,29915.67,18.92425,0.40495
std,1179.364453,620746000.0,1291.975501,448602.8,89672.84,0.232205,2.16923,3.383387,8.876334,0.0,12.348583,439305.9,5.040167,0.490895
min,0.0,39235.0,0.0,1.0,0.0,0.008786,2009.0,1.0,1.0,0.0,1.0,0.7356343,1.0,0.0
25%,1014.0,535719100.0,2.0,2000.0,53.27022,1.0,2014.0,3.0,7.0,0.0,30.0,2000.0,16.0,0.0
50%,2032.0,1076134000.0,17.0,5000.0,923.9855,1.0,2016.0,6.0,15.0,0.0,30.0,5000.0,20.0,0.0
75%,3055.25,1613053000.0,70.0,16607.75,5125.75,1.0,2017.0,9.0,23.0,0.0,38.0,15000.0,22.0,1.0
max,4095.0,2147431000.0,102203.0,50000000.0,4148577.0,1.716408,2019.0,12.0,31.0,0.0,92.0,50000000.0,36.0,1.0


In [4]:
num_to_delete = data["binary_state"].value_counts()[0] - data["binary_state"].value_counts()[1]
failed_indices = data[data["binary_state"] == 0].index
i_failed_indices = random.sample(range(0, failed_indices.shape[0]), num_to_delete)
indices_to_drop = []
for index in i_failed_indices:
    indices_to_drop.append(failed_indices[index])

In [5]:
data = data.drop(indices_to_drop)
data.shape

(16198, 30)

In [6]:
data["binary_state"].describe()

count    16198.000000
mean         0.500000
std          0.500015
min          0.000000
25%          0.000000
50%          0.500000
75%          1.000000
max          1.000000
Name: binary_state, dtype: float64

In [7]:
data.columns

Index(['Unnamed: 0', 'id', 'backers_count', 'blurb', 'currency', 'goal',
       'launched_at', 'deadline', 'location.country', 'name', 'usd_pledged',
       'slug', 'spotlight', 'staff_pick', 'static_usd_rate', 'state', 'year',
       'month', 'day', 'hour', 'days_to_deadline', 'goal_USD', 'category_name',
       'category_slug', 'blurb_length', 'location_type', 'location_country',
       'location_state', 'location_displayable_name', 'binary_state'],
      dtype='object')

In [8]:
data.drop(['Unnamed: 0', 'id', 'day', 'hour', 'blurb', 'currency', 'launched_at', 'deadline', 'name', 'slug', 'state', 'location_displayable_name', 'location.country', 'location_state'], axis = 1, inplace = True)

In [9]:
data.columns

Index(['backers_count', 'goal', 'usd_pledged', 'spotlight', 'staff_pick',
       'static_usd_rate', 'year', 'month', 'days_to_deadline', 'goal_USD',
       'category_name', 'category_slug', 'blurb_length', 'location_type',
       'location_country', 'binary_state'],
      dtype='object')

In [10]:
categorical_cols = ['location_country', 'category_name', 'category_slug', 'location_type']
binary_categorical = ['spotlish', 'staff_pick']

In [11]:
for cat in categorical_cols:
    print(cat, len(data[cat].unique()))

location_country 118
category_name 159
category_slug 15
location_type 8


In [12]:
data.drop(['location_country', 'category_name'], axis = 1, inplace = True)

In [13]:
data['spotlight'] = data['spotlight'].astype(int)
data['staff_pick'] = data['staff_pick'].astype(int)
data.dtypes

backers_count         int64
goal                float64
usd_pledged         float64
spotlight             int32
staff_pick            int32
static_usd_rate     float64
year                  int64
month                 int64
days_to_deadline      int64
goal_USD            float64
category_slug        object
blurb_length          int64
location_type        object
binary_state          int64
dtype: object

In [14]:
data = pd.get_dummies(data)
data.columns

Index(['backers_count', 'goal', 'usd_pledged', 'spotlight', 'staff_pick',
       'static_usd_rate', 'year', 'month', 'days_to_deadline', 'goal_USD',
       'blurb_length', 'binary_state', 'category_slug_art',
       'category_slug_comics', 'category_slug_crafts', 'category_slug_dance',
       'category_slug_design', 'category_slug_fashion',
       'category_slug_film & video', 'category_slug_food',
       'category_slug_games', 'category_slug_journalism',
       'category_slug_music', 'category_slug_photography',
       'category_slug_publishing', 'category_slug_technology',
       'category_slug_theater', 'location_type_Country',
       'location_type_County', 'location_type_Island',
       'location_type_LocalAdmin', 'location_type_Miscellaneous',
       'location_type_Suburb', 'location_type_Town', 'location_type_Zip'],
      dtype='object')

In [15]:
data.shape

(16198, 35)

In [16]:
df_y = data['binary_state']
df_y

0        0
1        0
3        0
4        0
5        1
        ..
19993    1
19994    1
19995    1
19997    0
19999    1
Name: binary_state, Length: 16198, dtype: int64

In [17]:
data.drop(['binary_state'], axis = 1, inplace = True)
data.shape

(16198, 34)

In [18]:
X = data.to_numpy()
y = df_y.to_numpy()

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [20]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(34, input_dim = 34, activation = 'relu'),
  tf.keras.layers.Dense(1, activation = 'sigmoid')
])
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.


In [21]:
hist = model.fit(X_train, y_train, epochs = 10, verbose = 1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
model.evaluate(X_test, y_test)



[0.2142515224491747, 0.9851852]

In [35]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(34, input_dim = 34, activation = 'relu'),
  tf.keras.layers.Dense(1, activation = 'sigmoid')
])
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [36]:
def my_cross_val(model, X, y, k):
    foldsX = np.array_split(X, k)
    foldsY = np.array_split(y, k)

    acc = []

    for i in range(0, k):
        testX = foldsX[i]
        testY = foldsY[i]
        train_arrays_X = np.delete(foldsX, i)
        train_arrays_Y = np.delete(foldsY, i)

        trainX = train_arrays_X[0]
        trainY = train_arrays_Y[0]
        for i in range (1, k-1):
            trainX = np.concatenate((trainX, train_arrays_X[i]))
            trainY = np.concatenate((trainY, train_arrays_Y[i]))

        model.fit(trainX, trainY, epochs = 2, verbose = 1)
        ret = model.evaluate(testX, testY)
        acc.append(ret[1])
    mean = 0
    for a in acc:
        mean += a
    mean /= k
    return {"acc": acc, "mean": mean}

In [37]:
ret = my_cross_val(model, X, y, 5)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [38]:
ret

{'acc': [0.97253084, 0.9771605, 0.97283953, 0.9756098, 0.9845631],
 'mean': 0.9765407562255859}