In [None]:
# import the necessary libraries you need for your analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix






# Random seed passed as random_state to the train/test splits in this notebook
RSEED = 42

Reading in the previously prepared data

In [None]:
# Load the prepared project data from its CSV file into the raw frame
prepared_csv = 'data/2_data.csv'
df = pd.read_csv(prepared_csv)

Checking on columns

In [None]:
# Display the column labels of the loaded frame
df.columns

Selecting the columns to work with and dropping the rest

In [None]:

# Keep only the columns used in this analysis and index the result by project id.
# set_index() is chained (no inplace=True) so df2 is produced in one step as a
# new frame instead of mutating a column subset of df after the fact.
df2 = df[[
    'id',
    'country',
    'backers_count',
    'converted_pledged_amount',
    'staff_pick',
    'category_main',
    'category_sub',
    'duration',
    'description_length',
    'name_length',
    'state',
    'goal_usd',
]].set_index('id')

Checking for empty values

In [None]:
# Count the missing values in every column of the working frame
missing_per_column = df2.isna().sum()
missing_per_column

Filling empty descriptions with 0

In [None]:
# Replace missing description lengths with 0.
# The previous form, df2.description_length.fillna(0, inplace=True), is a chained
# in-place call: it operates on a Series pulled out of df2, which pandas warns about
# and which no longer updates df2 once Copy-on-Write is active. Passing a
# {column: value} mapping to DataFrame.fillna returns a new frame with only that
# column filled.
df2 = df2.fillna({'description_length': 0})

building a useless boxplot

In [None]:
# Wide-form box plot of the working frame, drawn on an explicitly created axes
_, box_ax = plt.subplots()
sns.boxplot(data=df2, ax=box_ax)

Checking various stats of our data by main categories

In [None]:
# Per-category outcome counts: one row per main category, one column per state value
cat_df = (
    pd.get_dummies(df2.set_index('category_main').state)
    .groupby('category_main')
    .sum()
)
# Normalise the counts across each row, then keep the share of successful projects
success_share = cat_df.div(cat_df.sum(axis=1), axis=0).successful

# One bar colour per main category, sampled from the CMRmap colormap
color = cm.CMRmap(np.linspace(0.1, 0.6, df.category_main.nunique()))

by_category = df.groupby('category_main')

# 3 x 2 grid of panels; ravel() walks the grid row by row
fig, axes = plt.subplots(3, 2, figsize=(12, 12))
ax1, ax2, ax3, ax4, ax5, ax6 = axes.ravel()

# (target axes, per-category values, panel title)
panels = [
    (ax1, by_category.category_main.count(), 'Amount of project by category'),
    (ax2, by_category.goal_usd.median(), 'Median project goal ($)'),
    (ax3, by_category.converted_pledged_amount.median(), 'Median pledged per project ($)'),
    (ax4, success_share, 'Proportion of successful projects'),
    (ax5, by_category.backers_count.median(), 'Median backers per project'),
    (ax6, by_category.backers_count.mean(), 'Mean backers per project'),
]

for panel_ax, panel_values, panel_title in panels:
    panel_values.plot(kind='bar', ax=panel_ax, color=color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('')  # category names already label the bars

fig.subplots_adjust(hspace=0.6)
plt.show()

Dropping columns that could leak the target, as well as columns that are not used

In [None]:
# Remove the columns flagged as potential target leakage or as unused
columns_to_drop = ['backers_count', 'converted_pledged_amount', 'duration']
df_clean = df2.drop(columns=columns_to_drop)
df_clean

Building a simple logistic regression model that uses only the main category to predict the state

In [None]:
# Baseline model: predict the project state from the main category alone,
# before using the full feature set.

# Encode the target as 0/1.
# - .assign() builds a new frame instead of writing into the column slice
#   df2[[...]], which raised SettingWithCopyWarning.
# - The explicit lookup replaces Series.replace(), whose implicit downcast of an
#   object column to integers is deprecated in pandas; if the column stayed
#   object-typed, get_dummies() below would one-hot encode 'state' itself.
# Labels that are not in the mapping are left unchanged, as replace() did.
state_codes = {'failed': 0, 'successful': 1}
df_cm = df2[['state', 'category_main']].assign(
    state=lambda frame: frame['state'].map(lambda label: state_codes.get(label, label))
)
df_cm = pd.get_dummies(df_cm)
y_simple = df_cm['state']
X_simple = df_cm.drop('state', axis=1)

# No scaling needed: every feature is a 0/1 dummy column
X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(
    X_simple, y_simple, stratify=y_simple, random_state=RSEED
)

Lgsimple = LogisticRegression()
Lgsimple.fit(X_train_simple, y_train_simple)
y_p_simple = Lgsimple.predict(X_test_simple)

print("Performance on test set: ",Lgsimple.score(X_test_simple, y_test_simple))
print(classification_report(y_test_simple,y_p_simple))

Checking the subcategories' value counts, then dropping the subcategory column (too many dummy columns)

In [None]:
# Print the subcategory frequencies explicitly: a notebook cell only displays its
# last expression, so the bare value_counts() call produced no visible output.
print(df_clean.category_sub.value_counts())

# Drop the subcategory: one-hot encoding it would create too many dummy columns
df_clean = df_clean.drop('category_sub', axis=1)
df_clean

transforming state to 0 & 1 and getting dummy variables for other data

In [None]:
# Encode the target as 0/1 with an explicit lookup instead of Series.replace():
# replace() on an object column depends on an implicit downcast to integers that
# pandas has deprecated, and an object-typed 'state' would be one-hot encoded by
# get_dummies() below. Values that are not in the mapping (including already
# encoded 0/1 when this cell is re-run) are left unchanged.
state_codes = {'failed': 0, 'successful': 1}
df_clean = df_clean.assign(
    state=df_clean['state'].map(lambda label: state_codes.get(label, label))
)

# One-hot encode the remaining categorical columns
df_clean = pd.get_dummies(df_clean)
df_clean

Defining our target dataframe and features dataframe, performing a train-test-split

In [None]:


# Target vector and feature matrix
y = df_clean["state"]
X = df_clean.drop("state", axis=1)

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows here, i.e. before any train/test
# split, so X_scaled embeds statistics of rows that later end up in the test set.
scaling = StandardScaler()
# fit_transform() returns a bare array; restore the column labels AND the original
# index so that X_scaled stays aligned with y (previously it got a fresh 0..n-1 index).
X_scaled = pd.DataFrame(scaling.fit_transform(X), columns=X.columns, index=X.index)
X_scaled.head()

In [None]:
# create test and train data set
# Split the unscaled features first and fit the scaler on the training rows only.
# Scaling the full data set before splitting lets test-set statistics leak into the
# features the models are trained on. Same seed and stratification as before, so
# the same rows end up in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=RSEED
)

split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

In [None]:
# Logistic regression with default settings, trained on the training partition
LogReg = LogisticRegression()
LogReg.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out rows, then report the score and the per-class metrics
y_pred = LogReg.predict(X_test)
logreg_test_score = LogReg.score(X_test, y_test)

print("Performance on test set: ", logreg_test_score)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic regression predictions on the test partition
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
#Trying more complex model with Decision tree

# Seed the tree with the notebook-wide RSEED: scikit-learn permutes the candidate
# features randomly at each split, so an unseeded tree can differ between runs.
tree = DecisionTreeClassifier(random_state=RSEED)
tree.fit(X_train,y_train)
y_pred_tree = tree.predict(X_test)

In [None]:
# Report the decision tree's test score, per-class metrics and confusion matrix
tree_test_score = tree.score(X_test, y_test)
print("Performance on test set: ", tree_test_score)

tree_reports = (
    classification_report(y_test, y_pred_tree),
    confusion_matrix(y_test, y_pred_tree),
)
for tree_report in tree_reports:
    print(tree_report)

Without any hyperparameter optimization, logistic regression performs better overall. Recall is worse for the decision tree than for logistic regression.