In [None]:
# import the necessary libraries you need for your analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report

In [None]:
# set general params
# Apply the style sheet first: plt.style.use() overwrites every rcParam the
# sheet defines, so the explicit overrides below have to come after it.
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "figure.facecolor": "w",
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
})
pd.plotting.register_matplotlib_converters()
# Floats (decimal numbers) should be displayed rounded with 1 decimal place
pd.set_option('display.float_format', lambda x: '%.1f' % x)

In [None]:
# Load the raw data set from a path relative to the notebook's working directory.
df = pd.read_csv('data/2_data.csv')

In [None]:
# List the column names of the raw frame.
df.columns

In [None]:
# Print dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Keep only the columns used in this analysis.
# .copy() makes `data` an independent frame: later cells add columns to it, and
# assigning into a bare slice of `df` triggers pandas' SettingWithCopyWarning.
data = df[['backers_count', 'goal_usd', 'usd_pledged', 'staff_pick', 'spotlight', 'state']].copy()

In [None]:
# Preview the first rows of the selected columns.
data.head()

In [None]:
# (rows, columns) of the working frame.
data.shape

In [None]:
# Random subsample of up to 17000 rows.
# random_state makes the draw reproducible across kernel restarts; min() keeps
# sample() from raising when the frame holds fewer rows than requested.
sub = data.sample(n=min(17000, len(data)), random_state=0)

In [None]:
# Preview the first rows of the subsample.
sub.head()

In [None]:
# Column dtypes of the subsample.
sub.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the pledged amount.
data['usd_pledged'].describe()

In [None]:
# make size categories from the quantiles: small = 0-25%, medium = 25-50%, large = 50-75%, xlarge = 75-90%, xxlarge = 90-100%
 

In [None]:
# Boolean masks that place every usd_pledged value into one of five size
# buckets, split at the 25%, 50%, 75% and 90% quantiles (upper edge inclusive).
pledged = data['usd_pledged']
p25, p50, p75, p90 = (pledged.quantile(q) for q in (0.25, 0.5, 0.75, 0.9))
pledge_sizes = [
    pledged <= p25,
    (pledged > p25) & (pledged <= p50),
    (pledged > p50) & (pledged <= p75),
    (pledged > p75) & (pledged <= p90),
    pledged > p90,
]

In [None]:
# Labels for the pledge-size masks, in mask order: a readable name and a
# 1-based ordinal code per bucket.
pledge_values_string = [
    'small',
    'medium',
    'large',
    'xlarge',
    'xxlarge',
]
pledge_values_int = list(range(1, len(pledge_values_string) + 1))

In [None]:
# create a new column and use np.select to assign values to it using our lists as arguments
# Rows matching none of the masks (e.g. a missing usd_pledged) receive the default.
# The string labels need a string default: np.select's implicit default is the
# integer 0, which recent NumPy refuses to combine with string choices.
data['pledge_sizes_string'] = np.select(pledge_sizes, pledge_values_string, default='unknown')
data['pledge_sizes_int'] = np.select(pledge_sizes, pledge_values_int, default=0)

In [None]:
# Cast the pledge-size label column to str (astype returns a new frame that
# replaces `data`).
label_dtype = {'pledge_sizes_string': str}
data = data.astype(label_dtype)

# Show the two pledge-size columns side by side for the first rows.
data.loc[:, ['pledge_sizes_string', 'pledge_sizes_int']].head()

In [None]:
# Boolean masks that place every goal_usd value into one of five size buckets,
# split at the 25%, 50%, 75% and 90% quantiles (upper edge inclusive).
goal = data['goal_usd']
g25, g50, g75, g90 = (goal.quantile(q) for q in (0.25, 0.5, 0.75, 0.9))
goal_sizes = [
    goal <= g25,
    (goal > g25) & (goal <= g50),
    (goal > g50) & (goal <= g75),
    (goal > g75) & (goal <= g90),
    goal > g90,
]

In [None]:
# Labels for the goal-size masks, in mask order: a readable name and a
# 1-based ordinal code per bucket.
goal_values_string = [
    'small',
    'medium',
    'large',
    'xlarge',
    'xxlarge',
]
goal_values_int = list(range(1, len(goal_values_string) + 1))

In [None]:
# create a new column and use np.select to assign values to it using our lists as arguments
# Rows matching none of the masks (e.g. a missing goal_usd) receive the default.
# The string labels need a string default: np.select's implicit default is the
# integer 0, which recent NumPy refuses to combine with string choices.
data['goal_sizes_string'] = np.select(goal_sizes, goal_values_string, default='unknown')
data['goal_sizes_int'] = np.select(goal_sizes, goal_values_int, default=0)

In [None]:
# Cast the goal-size label column to str (astype returns a new frame that
# replaces `data`).
data = data.astype({'goal_sizes_string': str})

In [None]:
# Horizontal box plot of the pledged amount.
sns.boxplot(
    x='usd_pledged',
    data=data,
)

In [None]:
# Number of campaigns per pledge-size bucket, split by campaign state.
sns.countplot(
    x='pledge_sizes_string',
    hue='state',
    order=['small', 'medium', 'large', 'xlarge', 'xxlarge'],
    data=data,
)

In [None]:
# Horizontal box plot of the funding goal.
sns.boxplot(
    x='goal_usd',
    data=data,
)

In [None]:
# Preview the frame including the size-bucket columns.
data.head()

In [None]:
# Funding rate: pledged amount as a percentage of the goal.
# NOTE(review): a goal_usd of 0 would yield inf/NaN here — confirm goals are always positive.
funding_ratio = data['usd_pledged'].div(data['goal_usd'])
data['percent_reached'] = funding_ratio.mul(100)

In [None]:
# Preview the frame including the new percent_reached column.
data.head()

In [None]:
# Funding rate against the ordinal goal-size bucket, coloured by state.
sns.scatterplot(
    x='goal_sizes_int',
    y='percent_reached',
    hue='state',
    data=data,
)

In [None]:
# Number of campaigns per goal-size bucket.
sns.countplot(
    x='goal_sizes_string',
    order=['small', 'medium', 'large', 'xlarge', 'xxlarge'],
    data=data,
)

In [None]:
# Number of campaigns per goal-size bucket, split by campaign state.
sns.countplot(
    x='goal_sizes_string',
    hue='state',
    order=['small', 'medium', 'large', 'xlarge', 'xxlarge'],
    data=data,
)

In [None]:
# Preview the frame before building the pair-plot subset.
data.head()

In [None]:
# Subset for the pair plot: everything except the raw amounts, the string
# labels and the two flag columns.
pairplot_excluded = [
    'goal_usd',
    'usd_pledged',
    'goal_sizes_string',
    'pledge_sizes_string',
    'staff_pick',
    'spotlight',
]
test = data.drop(columns=pairplot_excluded)

In [None]:
# Pairwise relationships of the remaining columns, coloured by state.
sns.pairplot(
    data=test,
    hue='state',
)

In [None]:
# Rows whose state is anything other than 'failed'.
# NOTE(review): every non-'failed' state lands here — confirm the data only
# holds successful and failed campaigns.
not_failed = data['state'].ne('failed')
data_success = data[not_failed]

In [None]:
# Rows whose state is exactly 'failed'.
is_failed = data['state'].eq('failed')
data_fail = data[is_failed]

In [None]:
# Non-null value counts per column for each goal-size bucket (non-failed rows).
(
    data_success
    .groupby('goal_sizes_string')
    .count()
    .reset_index()
)

In [None]:
# Median of the numeric columns per goal-size bucket (non-failed rows).
# numeric_only=True is required because the frame also carries string columns
# ('state', 'pledge_sizes_string'); pandas >= 2.0 raises on them instead of
# silently skipping them.
data_success.groupby('goal_sizes_string').median(numeric_only=True).reset_index()

In [None]:
# Median of the numeric columns per goal-size bucket (failed rows).
# numeric_only=True is required because the frame also carries string columns
# ('state', 'pledge_sizes_string'); pandas >= 2.0 raises on them instead of
# silently skipping them.
data_fail.groupby('goal_sizes_string').median(numeric_only=True)

In [None]:
# Per-bucket non-null counts for the non-failed and the failed rows, with the
# bucket label restored as an ordinary column.
bucket_column = 'goal_sizes_string'
count_suc = data_success.groupby(bucket_column).count().reset_index()
count_fail = data_fail.groupby(bucket_column).count().reset_index()

In [None]:
# Summary statistics of the non-failed rows.
data_success.describe()

In [None]:
# Show the per-bucket counts of the non-failed rows.
count_suc

In [None]:
# Non-failed counts restricted to the 'small' and 'medium' goal buckets.
small_or_medium = count_suc['goal_sizes_string'].isin(['small', 'medium'])
rel_suc_med = count_suc[small_or_medium]

In [None]:
# Failed counts restricted to the 'small' and 'medium' goal buckets.
small_or_medium_fail = count_fail['goal_sizes_string'].isin(['small', 'medium'])
rel_suc_med_fail = count_fail[small_or_medium_fail]

In [None]:
# Share of non-failed campaigns among all small/medium-goal campaigns
# (fraction between 0 and 1, based on the per-bucket 'state' counts).
n_not_failed = rel_suc_med['state'].sum()
n_failed = rel_suc_med_fail['state'].sum()
n_not_failed / (n_not_failed + n_failed)

In [None]:
# Pooled small+medium non-failed count divided by each bucket's total count, in percent.
# NOTE(review): the numerator is pooled over two buckets while the denominator
# is per bucket — confirm this ratio is what was intended.
sum(rel_suc_med['state'])/(count_suc['percent_reached']+count_fail['percent_reached'])*100

In [None]:
# Percentage of non-failed campaigns per goal-size bucket, using the
# percent_reached counts as the per-bucket row counts.
# NOTE(review): the two count frames are matched by row index — this presumes
# both contain the same buckets in the same order; verify.
not_failed_counts = count_suc['percent_reached']
bucket_totals = not_failed_counts + count_fail['percent_reached']
rel_suc = (not_failed_counts / bucket_totals * 100).to_frame()

In [None]:
# Attach the bucket label to each percentage (aligned on the shared row index).
rel_suc['goal_sizes_string']=count_suc['goal_sizes_string']

In [None]:
# Show the per-bucket percentage table.
rel_suc

In [None]:
# Percentage of non-failed campaigns per goal-size bucket, smallest bucket first.
sns.barplot(
    x='goal_sizes_string',
    y='percent_reached',
    order=['small', 'medium', 'large', 'xlarge', 'xxlarge'],
    data=rel_suc,
)

In [None]:
# Column-wise maximum per goal-size bucket (non-failed rows).
(
    data_success
    .groupby('goal_sizes_string')
    .max()
    .reset_index()
)

In [None]:
# Show the per-bucket percentage table again for reference.
rel_suc

In [None]:
# Correlation heatmap of the numeric and boolean columns.
# The frame also holds string columns ('state', the *_sizes_string labels);
# DataFrame.corr() on pandas >= 2.0 raises on those, so select the numeric
# subset explicitly (this also works on older pandas).
numeric_data = data.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(), annot=True)

In [None]:
# Preview the full working frame before selecting model columns.
data.head()

In [None]:
# Drop spotlight, percent_reached, the pledge-size columns and the goal-size label.
# percent_reached is computed from usd_pledged / goal_usd, so keeping it would
# hand the model a feature derived from the pledged outcome.
data_w = data.drop(
    columns=[
        'pledge_sizes_string',
        'spotlight',
        'goal_sizes_string',
        'pledge_sizes_int',
        'percent_reached',
    ]
)


In [None]:
# Preview the reduced frame.
data_w.head()

In [None]:
# Mark the flag and the ordinal goal bucket as categorical columns.
for column in ('staff_pick', 'goal_sizes_int'):
    data_w[column] = data_w[column].astype('category')


In [None]:
# Check dtypes after the category casts.
data_w.info()

In [None]:
# Preview the reduced frame after the category casts.
data_w.head()

In [None]:
# Modelling frame: data_w without backers_count, usd_pledged and the
# categorical staff_pick column.
model_excluded = ['backers_count', 'usd_pledged', 'staff_pick']
df_mod = data_w.drop(columns=model_excluded)

In [None]:
# Show the modelling frame.
df_mod

In [None]:
# onehot encode staff_pick
# staff_pick was dropped when df_mod was built, so it has to be read from
# data_w; looking it up in df_mod raises KeyError.
# drop_first leaves one indicator column for a two-valued flag; take it as a
# Series so exactly one column is assigned, stored as 0/1.
# NOTE(review): assumes staff_pick has exactly two distinct values — confirm.
df_mod['staff_pick'] = (
    pd.get_dummies(data_w['staff_pick'], drop_first=True)
    .iloc[:, 0]
    .astype(int)
)

In [None]:
# One-hot encode the goal-size bucket.
# get_dummies() returns one column per remaining category, which cannot be
# assigned to the single 'goal_sizes_int' column; replace that column with the
# prefixed indicator columns instead. The prefix also keeps every column name a
# string.
goal_size_dummies = pd.get_dummies(data['goal_sizes_int'], prefix='goal_size', drop_first=True)
df_mod = pd.concat(
    [
        # errors='ignore' keeps this cell re-runnable once the columns were swapped
        df_mod.drop(columns=['goal_sizes_int', *goal_size_dummies.columns], errors='ignore'),
        goal_size_dummies,
    ],
    axis=1,
)

In [None]:
# Preview the modelling frame after encoding.
df_mod.head()

In [None]:
# Target is the campaign state; every other column of df_mod is a predictor.
y = df_mod['state']
X = df_mod.drop(columns=['state'])

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Fit a logistic regression with default settings on the training split and
# predict the state of the held-out rows.
logistic_regression = LogisticRegression().fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted state, drawn as an annotated heatmap
# with integer cell labels.
cfm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(data=cfm, annot=True, fmt='d', cmap='YlGnBu', linewidths=.5);

In [None]:
# Per-class precision, recall and F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# knn

In [None]:
# 