In [1]:
import pandas as pd
import numpy as np
import import_ipynb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

In [2]:
import base
# If you have a question about any of the functions in base, you can either read the
# commented code in base.ipynb or call help() on the function

importing Jupyter notebook from base.ipynb


In [14]:
# For example, print the docstring of base.xgb (signature, arguments and return value)
help(base.xgb)

Help on function xgb in module base:

xgb(X_train, X_test, y_train, y_test, metric='accuracy', booster='gbtree', eta=0.3, max_depth=6, reg_lambda=1, verbose=False)
    This function performs xgboost on the data and returns the accuracy of the model
    Necessary Arguments:
        X_train : The training data
        X_test : The test data
        y_train : The target values
        y_test : The target values for the training data
    
    Optional Arguments:
        metric : The metric to calculate the model performance, Options: "accuracy", "precision", "recall", "f1"
        verbose : If True, the function will print the metric of the model
        booster : The type of booster to use, Options: "gbtree", "gblinear", "dart"
        eta : The learning rate of the model, between [0,1]
        max_depth : The maximum depth of the trees, default is 6 to avoid overfitting
        reg_lambda : The regularization parameter of the model
    
    Returns:
        metric_value : The number of t

In [7]:
# This is what the code would look like without using any functions from base:
# load the data, clean it, encode it, split it, fit a logistic regression and report metrics.

# Read in the csv file
data = pd.read_csv('data/kickstarter_projects.csv')

# Drop the ID and Name columns and keep only the rows where the state is either
# successful or failed, so that the target is binary
data = data.drop(columns=["ID", "Name"])
data = data[data["State"].isin(["Successful", "Failed"])]

# Convert the deadline and launched columns to datetime, calculate the duration of the
# project in whole days and add it as a new column
data["Deadline"] = pd.to_datetime(data["Deadline"], format='%Y-%m-%d')
data["Launched"] = pd.to_datetime(data["Launched"], format='%Y-%m-%d %H:%M:%S')
data["Duration"] = (data["Deadline"] - data["Launched"]).dt.days

# Transform all the text columns (including the target "State") to integer codes.
# Both object and dedicated string dtypes are checked: comparing against 'object' alone
# would skip text columns whenever pandas infers a string dtype for them. Datetime and
# numeric columns match neither check and are left untouched.
# LabelEncoder orders its classes, so presumably Failed -> 0 and Successful -> 1.
for column in data.columns:
    is_text_column = (
        pd.api.types.is_object_dtype(data[column])
        or pd.api.types.is_string_dtype(data[column])
    )
    if is_text_column:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])

# Split the data into the features and the target and then do the train/test split
y = data['State']
# We left some columns in the data which might be useful for EDA; they are removed here
X = data.drop(columns=['State', 'Launched', 'Deadline', 'Pledged'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=75)

# Create a logistic regression model (L2 penalty, C=0.1, at most 1000 iterations)
model = LogisticRegression(max_iter=1000, penalty="l2", C=0.1)
# Fit the model to the training data
model.fit(X_train, y_train)
# Predict the target values for the test data
y_pred = model.predict(X_test)

# Calculate the precision and the confusion matrix on the test data
precision = precision_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# Print the results
print("Precision: ", precision)
print("Confusion Matrix:\n ", confusion)

# These are only available for logistic regression, not for the other models
print(f"Model Coefficients: {model.coef_}")
print(f"Model Intercept: {model.intercept_}")

Precision:  0.9291990537343697
Confusion Matrix:
  [[37771  1676]
 [ 4850 21996]]
Model Coefficients: [[-0.01076887 -0.0009772  -0.00749205 -0.00023625  0.0545101  -0.01675955]]
Model Intercept: [-0.00265765]


In [15]:
# The same kind of workflow, this time written with the functions from base
data, transform_data = base.get_data()

# Some columns were left in the data because they might be useful for EDA;
# they are removed from the features here
X = data.drop(columns=["State", "Launched", "Deadline", "Pledged"])
y = data["State"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=152
)

# Train and evaluate logistic regression; verbose=True prints the model details
f1 = base.logistic_regression(
    X_train, X_test, y_train, y_test, metric="f1", verbose=True
)

Model Coefficients: [[-0.01210504 -0.00098262 -0.00667038 -0.00023508  0.05487012 -0.01718795]]
Model Intercept: [-0.00382842]
Model Score: [1 0 0 ... 1 0 0]
Confusion Matrix: [[37735  1759]
 [ 4716 22083]]


In [16]:
# Changing the model, or choosing a different one, only requires calling a different
# function in the last statement of this cell.
# Here xgboost is used instead of logistic regression; everything else stays the same.
data, transform_data = base.get_data()

X = data.drop(columns=["State", "Launched", "Deadline", "Pledged"])
y = data["State"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=152
)

f1 = base.xgb(
    X_train, X_test, y_train, y_test, metric="f1", verbose=True
)

Model Score: [1 0 0 ... 1 0 0]
Confusion Matrix
: [[37047  2447]
 [ 1951 24848]]


In [None]:
# With the pre-set (default) hyperparameters, these functions gave the following scores on our data:

# decision tree: 87.6% f1 score, 90.1% accuracy, 88% precision, 87.5% recall
# logistic regression: 87.2% f1 score, 90.2% accuracy, 92.6% precision, 82.4% recall
# xgboost: 92% f1 score, 93.3% accuracy, 91% precision, 92.7% recall

In [11]:
# Grid search: declare the candidate values for each hyperparameter, and
# base.grid_search will try all combinations and report the result for each one.
hyperparameters = dict(
    eta=[0.1, 0.3, 1],
    max_depth=[3, 6, 12, 24],
)

# The first argument is the model function to evaluate (here base.xgb, i.e. xgboost),
# followed by the grid, the metric name and the train/test data.
# NOTE(review): in the saved output the "f1" value 0.933658 equals the accuracy implied
# by the xgboost confusion matrix shown earlier rather than its F1, and it is identical
# for every max_depth -- verify that base.grid_search forwards the metric and all
# hyperparameters to the model function.
results = base.grid_search(
    base.xgb,
    hyperparameters,
    "f1",
    X_train,
    X_test,
    y_train,
    y_test,
)
# The result is a DataFrame with one row per parameter combination; show the first rows
results.head()

Unnamed: 0,Parameters,f1
0,"{'eta': 0.3, 'max_depth': 3}",0.933658
1,"{'eta': 0.3, 'max_depth': 6}",0.933658
2,"{'eta': 0.3, 'max_depth': 12}",0.933658
3,"{'eta': 0.3, 'max_depth': 24}",0.933658
4,"{'eta': 0.1, 'max_depth': 3}",0.932829


In [None]:
# Example (commented out): grid search for logistic regression, called through base
#hyperparameters = {"C":[0.1,1,10],"max_iter":[100,500,1000]}
#results = base.grid_search(base.logistic_regression,hyperparameters,"f1",X_train,X_test,y_train,y_test)
#results.head()

In [21]:
# decision tree: 87.6% f1 score, 90.1% accuracy, 88% precision, 87.5% recall
# logistic regression: 87.2% f1 score, 90.2% accuracy, 92.6% precision, 82.4% recall
# xgboost: 92% f1 score, 93.3% accuracy, 91% precision, 92.7% recall

Model Score: [1 0 0 ... 0 0 1]
Confusion Matrix
: [[36288  3206]
 [ 3363 23436]]
0.874510242919512
