In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import statsmodels.api as sm


In [3]:
# Load the Kickstarter export, keep only projects whose State is 'Successful'
# or 'Failed', and add a 0/1 target column named 'Success'
# (1 = 'Successful', 0 = 'Failed').
df = (
    pd.read_csv("../data/kickstarter_projects.csv")
    .loc[lambda raw: raw['State'].isin(['Successful', 'Failed'])]
    .assign(Success=lambda kept: kept['State'].eq('Successful').astype(int))
)

# Class balance of the target.
display(df['Success'].value_counts())

Success
0    197611
1    133851
Name: count, dtype: int64

In [4]:
# Per-country project count and success rate (mean of the 0/1 Success flag),
# ordered from the most to the least represented country.
country_stats = (
    df.groupby('Country')['Success']
    .agg(n_projects='count', success_rate='mean')
    .sort_values('n_projects', ascending=False)
)

display(country_stats.head(20))


Unnamed: 0_level_0,n_projects,success_rate
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,261358,0.418196
United Kingdom,29453,0.409704
Canada,12370,0.334196
Australia,6616,0.303809
Germany,3436,0.272701
France,2520,0.360317
Netherlands,2411,0.25591
Italy,2369,0.18531
Spain,1873,0.26268
Sweden,1509,0.337309


In [5]:
# Countries with fewer than `min_projects` projects are relabelled 'Other';
# all remaining countries keep their own label.
min_projects = 400
country_counts = df['Country'].value_counts()
major_countries = country_counts.loc[country_counts.ge(min_projects)].index

is_major = df['Country'].isin(major_countries)
df['Country_simplified'] = df['Country'].mask(~is_major, 'Other')

# Level sizes after pooling.
display(df['Country_simplified'].value_counts())


Country_simplified
United States     261358
United Kingdom     29453
Canada             12370
Australia           6616
Germany             3436
France              2520
Netherlands         2411
Italy               2369
Spain               1873
Sweden              1509
Mexico              1411
New Zealand         1274
Denmark              926
Ireland              683
Switzerland          652
Norway               582
Belgium              523
Austria              485
Hong Kong            477
Singapore            454
Other                 80
Name: count, dtype: int64

In [53]:
# One-hot encode the simplified country label as integer 0/1 columns.
# drop_first=True omits the first level, which therefore acts as the
# reference category when an intercept is added to the model.
country_dummies = (
    pd.get_dummies(df['Country_simplified'], prefix='Country', drop_first=True)
    .astype(int)
)

# Preview of the encoded frame, then the dtypes of the target and predictors.
display(country_dummies.head(), df['Success'].dtype, country_dummies.dtypes)

Unnamed: 0,Country_Austria,Country_Belgium,Country_Canada,Country_Denmark,Country_France,Country_Germany,Country_Hong Kong,Country_Ireland,Country_Italy,Country_Mexico,Country_Netherlands,Country_New Zealand,Country_Norway,Country_Other,Country_Singapore,Country_Spain,Country_Sweden,Country_Switzerland,Country_United Kingdom,Country_United States
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


dtype('int32')

Country_Austria           int32
Country_Belgium           int32
Country_Canada            int32
Country_Denmark           int32
Country_France            int32
Country_Germany           int32
Country_Hong Kong         int32
Country_Ireland           int32
Country_Italy             int32
Country_Mexico            int32
Country_Netherlands       int32
Country_New Zealand       int32
Country_Norway            int32
Country_Other             int32
Country_Singapore         int32
Country_Spain             int32
Country_Sweden            int32
Country_Switzerland       int32
Country_United Kingdom    int32
Country_United States     int32
dtype: object

In [54]:
# X: the country indicator columns with an intercept column added
X_country = sm.add_constant(country_dummies)

# Y: the 0/1 Success flag
y = df['Success']

# Logistic regression of Success on country; coefficients are relative to
# the level that was dropped when the dummies were built.
model_country = sm.Logit(endog=y, exog=X_country).fit()

display(model_country.summary())
  

Optimization terminated successfully.
         Current function value: 0.671035
         Iterations 5


0,1,2,3
Dep. Variable:,Success,No. Observations:,331462.0
Model:,Logit,Df Residuals:,331441.0
Method:,MLE,Df Model:,20.0
Date:,"Tue, 02 Dec 2025",Pseudo R-squ.:,0.005182
Time:,21:10:25,Log-Likelihood:,-222420.0
converged:,True,LL-Null:,-223580.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.8292,0.027,-31.019,0.000,-0.882,-0.777
Country_Austria,-0.4328,0.113,-3.840,0.000,-0.654,-0.212
Country_Belgium,-0.0631,0.100,-0.631,0.528,-0.259,0.133
Country_Canada,0.1400,0.033,4.263,0.000,0.076,0.204
Country_Denmark,0.3767,0.073,5.195,0.000,0.235,0.519
Country_France,0.2552,0.049,5.171,0.000,0.158,0.352
Country_Germany,-0.1517,0.047,-3.248,0.001,-0.243,-0.060
Country_Hong Kong,0.6400,0.096,6.681,0.000,0.452,0.828
Country_Ireland,-0.0035,0.087,-0.040,0.968,-0.175,0.168


In [None]:
# Multivariable regression: build the numeric and categorical predictors.

# Parse the launch and deadline columns as datetimes.
df['Launched_dt'] = pd.to_datetime(df['Launched'])
df['Deadline_dt'] = pd.to_datetime(df['Deadline'])

# Launch year, and campaign length in whole days (deadline minus launch).
df['Launched_year'] = df['Launched_dt'].dt.year
df['duration_days'] = (df['Deadline_dt'] - df['Launched_dt']).dt.days

# log1p(x) = log(1 + x), so a value of 0 maps to 0 instead of -inf.
df['log_goal'] = np.log1p(df['Goal'])
df['log_backers'] = np.log1p(df['Backers'])
# NOTE(review): Backers is presumably only known once a campaign has run, so
# using it to predict Success may leak the outcome — confirm it belongs here.

# Numeric predictors. The duration column is named 'duration_days' (created
# above); the previous label 'durationdays' did not exist and raised KeyError.
num_vars = df[['log_goal', 'log_backers', 'duration_days']]

# Categorical predictors, one-hot encoded as integer 0/1 columns with the
# first level of each variable dropped as the reference category.
cat_dummies = pd.get_dummies(df['Category'], prefix='Cat', drop_first=True).astype(int)
subcat_dummies = pd.get_dummies(df['Subcategory'], prefix='Subcat', drop_first=True).astype(int)


In [None]:
# Assemble the full design matrix by placing the country, category and
# subcategory indicator columns next to the numeric predictors (column-wise).
# NOTE(review): if each Subcategory belongs to exactly one Category, the Cat_*
# columns are linear combinations of the Subcat_* columns and the constant,
# which would make this matrix rank-deficient — confirm before fitting.
X_big = pd.concat([country_dummies,cat_dummies,subcat_dummies, num_vars], axis=1)

# Add the intercept column.
X_big = sm.add_constant(X_big)

# Response: the 0/1 Success flag.
y = df['Success']

model_big = 