In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Data has no none entry so it is not necessary to drop any rows
# Transform categorical string columns to numerical values and keep a dictionary to map them back
def transform_strings_to_numerical(data):
    """
    This function transforms all string values in the dataframe to numerical values using the LabelEncoder from sklearn.
    Args:
        data : Our dataframe which we want to modify

    Returns:
        data : Our modified dataframe
        transform_data : A dictionary containing the mapping from the original string values to the numerical values
    """

    transform_data = {}
    for column in data.columns:
        # If data type is an object, for example a string, we want to convert the column to numerical values
        if data[column].dtype == 'object' and data[column].dtype != 'datetime64[ns]':
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
            # Save the mapping in a dictionary
            transform_data[column] = dict(zip(le.classes_, le.transform(le.classes_)))
    return data, transform_data

In [3]:
def transform_numerical_to_string(data,transform_data):
    """
    This function transforms all numerical values in the dataframe back to the original string values using the LabelEncoder from sklearn.
    Args:
        data : Our dataframe which we want to modify
        transform_data : A dictionary containing the mapping from the original string values to the numerical values

    Returns:
        data : Our modified dataframe
    """
    for column, mapping in transform_data.items():
        #print(f"Mapping for column {column}: {mapping}")
        #if one of the mapped columns is not in the data anymore, we skip it
        if column not in data.columns:
            continue
        # need to reverse the mapping to map back to the original string values
        # this simply swaps the keys and values in the dictionary
        reverse_mapping = {v: k for k, v in mapping.items()}
        # now we can convert back to the original string values
        data[column] = data[column].map(reverse_mapping)
    return data

In [18]:
def get_data():
    """
    Returns the data from the csv file and transforms the categorical values to numerical values
    """
    # read in the data from the csv file
    data = pd.read_csv('data/kickstarter_projects.csv')
    # transform the categorical values to numerical values

    data = data.drop(["ID","Name"],axis=1)
    data = data[(data["State"] == "Successful") | (data["State"] == "Failed")]
    data["Deadline"] = pd.to_datetime(data["Deadline"],format='%Y-%m-%d')
    data["Launched"] = pd.to_datetime(data["Launched"],format='%Y-%m-%d %H:%M:%S')
    data["Duration"] = (data["Deadline"] - data["Launched"]).dt.days


    data, transform_data = transform_strings_to_numerical(data)



    """
    We have converted Deadline and Launched to DateTime objects and calculated the duration in days
    We also, at least for now, drop all live or suspended or canceled projects
    
    """
    #return the data and the transformation_data in case we want to transform the data back
    return data, transform_data

In [5]:
def get_original_data():
    """
    Returns the original data without any modifications
    """
    # read in the data from the csv file
    data = pd.read_csv('data/kickstarter_projects.csv')

    data = data.drop(["ID","Name"],axis=1)
    data = data[(data["State"] == "Successful") | (data["State"] == "Failed")]
    data["Deadline"] = pd.to_datetime(data["Deadline"],format='%Y-%m-%d')
    data["Launched"] = pd.to_datetime(data["Launched"],format='%Y-%m-%d %H:%M:%S')
    data["Duration"] = (data["Deadline"] - data["Launched"]).dt.days



    """
    We have converted Deadline and Launched to DateTime objects and calculated the duration in days
    We also, at least for now, drop all live or suspended or canceled projects
    
    """
    #return the data
    return data

In [6]:
def remove_outlier(data,columns,threshold=3):
    """
    This function removes outliers from the data based on the threshold
    Args:
        data : Our dataframe which we want to modify
        columns : The columns which we want to check for outliers
        threshold : The threshold which we use to determine if a value is an outlier
        Multiplied by the standard deviation of the column to determine the range of values which are not outliers
        I advise setting the threshold to 3

    Returns:
        data : Our modified dataframe
    """
    
    for column in columns:
        data = data[np.abs(data[column]-data[column].mean()) <= (threshold*data[column].std())]
    return data

In [14]:
def logistic_regression(X_train,X_test,y_train,y_test,metric="accuracy",verbose=False,norm="l2",max_iter=1000,C=1.0):
    from sklearn.metrics import confusion_matrix
    """
    This function performs logistic regression on the data and returns the accuracy of the model
    Necessary Arguments:
        X_train : The training data
        X_test : The test data
        y : The target values
        y_train : The target values for the training data

    Optional Arguments:
        metric : The metric to calculate the model performance, Options: "accuracy", "precision", "recall", "f1"
        verbose : If True, the function will print the metric of the model
        norm : The norm to use for the logistic regression
        max_iter : The maximum number of iterations for the logistic regression
        C : The regularization parameter for the logistic regression

    Returns:
        metric_value : The number of the metric specified in the arguments
    """

    # create a logistic regression model
    model = LogisticRegression(max_iter=max_iter,penalty=norm,C=C)

    # fit the model to the training data
    model.fit(X_train, y_train)

    # predict the target values for the test data
    y_pred = model.predict(X_test)

    if verbose==True:
        print(f"Model Coefficients: {model.coef_}")
        print(f"Model Intercept: {model.intercept_}")
        print(f"Model Score: {y_pred}")
        print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')


    metric_value = 0
    # calculate the metric of the model
    if metric == "accuracy":
        metric_value = accuracy_score(y_test, y_pred)
    if metric == "precision":
        metric_value = precision_score(y_test, y_pred)
    if metric == "recall":
        metric_value = recall_score(y_test, y_pred)
    if metric == "f1":
        metric_value = f1_score(y_test, y_pred)
    return metric_value

In [8]:
from sklearn.model_selection import train_test_split
# test out the functions
data,transform_data = get_data()
print(data.head())

y = data['State']
X = data.drop(['State','Launched','Deadline'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

   Category  Subcategory  Country            Launched   Deadline   Goal   
0         5           52       21 2009-04-21 21:02:48 2009-05-31   1000  \
1         6          129       21 2009-04-23 00:07:53 2009-07-20  80000   
2         0           70       21 2009-04-24 21:52:03 2009-05-03     20   
3        13          131       21 2009-04-25 17:36:21 2009-07-14     99   
4         5           52       21 2009-04-27 14:10:39 2009-05-26   1900   

   Pledged  Backers  State  Duration  
0      625       30      0        39  
1       22        3      0        87  
2       35        3      1         8  
3      145       25      1        79  
4      387       10      0        28  


((265169, 7), (66293, 7), (265169,), (66293,))

In [9]:

def grid_search(model,parameters,metric,X_train,X_test,y_train,y_test):
    from itertools import product
    """
    
    Perform grid search for the given machine learning model and hyperparameters to find the best hyperparameters

    Parameters:
    model - The machine learning model function to use
    param_grid - The hyperparameters to test, given in form of a list of dictionaries
    metric - A string specifying the metric to use for evaluation
    X_train, X_test, y_train, y_test - The training and test data

    Returns: A pandas Dataframe containing the hyperparameters and the corresponding metric value, 
    sorted by the metric value in descending order

    """
    # Create all possible permutations of the hyperparameters, so if a={1,2} and b={3,4} we get [{1,3},{1,4},{2,3},{2,4}]
    keys, values = zip(*parameters.items())
    permutations = [dict(zip(keys, v)) for v in product(*values)]

    # Create a list to store the results
    results = []

    for params in permutations:
        # feed the model with the hyperparameters
        # ** unpacks the dictionary into the form dict[key]=value -> key = value
        metric_value = model(X_train,X_test,y_train,y_test,**params)

        # Append the results to the list
        results.append((params, metric_value))

    # After the loop is done, we sort the results by the metric value
    results.sort(key=lambda x: x[1], reverse=True)

    results = pd.DataFrame(results, columns=['Parameters', metric])

    return results

    

In [None]:
#import warnings
#
# warnings.filterwarnings("ignore", category=ConvergenceWarning)

hyperparameters = {"eta":[0.1,0.3,1],"max_depth":[3,6,12,24]}
results = grid_search(xgb,hyperparameters,"f1",X_train,X_test,y_train,y_test)
results.head()

Unnamed: 0,Parameters,f1
0,"{'eta': 0.3, 'max_depth': 3}",0.999261
1,"{'eta': 0.3, 'max_depth': 6}",0.999261
2,"{'eta': 0.3, 'max_depth': 12}",0.999261
3,"{'eta': 0.3, 'max_depth': 24}",0.999261
4,"{'eta': 1, 'max_depth': 3}",0.999125


In [11]:
X_train.head()

Unnamed: 0,Category,Subcategory,Country,Goal,Pledged,Backers,Duration
164774,3,99,21,3500,3501,19,29
74178,10,90,20,320,567,27,29
296198,13,138,21,250000,275,2,29
92665,6,129,21,7000,528,23,38
191647,12,95,21,2000,80,3,14


In [12]:
hyperparameters = {"C":[0.1,1,10],"max_iter":[100,500,1000]}
results = grid_search(logistic_regression,hyperparameters,"f1",X_train,X_test,y_train,y_test)
results.head()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Parameters,f1
0,"{'C': 0.1, 'max_iter': 100}",0.999155
1,"{'C': 0.1, 'max_iter': 500}",0.999155
2,"{'C': 0.1, 'max_iter': 1000}",0.999155
3,"{'C': 1, 'max_iter': 100}",0.999155
4,"{'C': 1, 'max_iter': 500}",0.999155


In [16]:
logistic_regression(X_train,X_test,y_train,y_test,metric="f1",verbose=True,C=0.1,max_iter=500)

Model Coefficients: [[ 0.02520122  0.00417009  0.04143354 -0.10483769  0.10516741  0.08317194
   0.00119811]]
Model Intercept: [0.00554605]
Model Score: [0 0 0 ... 0 1 0]
Confusion Matrix: [[39351    56]
 [    0 26886]]


0.9989596492531768

In [None]:
def decision_tree(X_train,X_test,y_train,y_test,metric="accuracy",verbose=False):
    from sklearn.tree import DecisionTreeClassifier
    """
    This function performs logistic regression on the data and returns the accuracy of the model
    Necessary Arguments:
        X_train : The training data
        X_test : The test data
        y_train : The target values
        y_test : The target values for the training data

    Optional Arguments:
        metric : The metric to calculate the model performance, Options: "accuracy", "precision", "recall", "f1"
        verbose : If True, the function will print the metric of the model

    Returns:
        metric_value : The number of the metric specified in the arguments
    """

    # create a decision tree model
    model = DecisionTreeClassifier()

    # fit the model to the training data
    model.fit(X_train, y_train)

    # predict the target values for the test data
    y_pred = model.predict(X_test)

    if verbose==True:
        print(f"Model Score: {y_pred}")
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names)
        disp.plot()

    metric_value = 0
    # calculate the metric of the model
    if metric == "accuracy":
        metric_value = accuracy_score(y_test, y_pred)
    if metric == "precision":
        metric_value = precision_score(y_test, y_pred)
    if metric == "recall":
        metric_value = recall_score(y_test, y_pred)
    if metric == "f1":
        metric_value = f1_score(y_test, y_pred)
    return metric_value

In [13]:
def xgb(X_train,X_test,y_train,y_test,metric="accuracy",booster="gbtree",eta=0.3,max_depth=6, reg_lambda=1, verbose=False):

    """
    This function performs xgboost on the data and returns the accuracy of the model
    Necessary Arguments:
        X_train : The training data
        X_test : The test data
        y_train : The target values
        y_test : The target values for the training data

    Optional Arguments:
        metric : The metric to calculate the model performance, Options: "accuracy", "precision", "recall", "f1"
        verbose : If True, the function will print the metric of the model
        booster : The type of booster to use, Options: "gbtree", "gblinear", "dart"
        eta : The learning rate of the model, between [0,1]
        max_depth : The maximum depth of the trees, default is 6 to avoid overfitting
        reg_lambda : The regularization parameter of the model

    Returns:
        metric_value : The number of the metric specified in the arguments
    """
    from xgboost import XGBClassifier
    # create a logistic regression model
    model = XGBClassifier(booster=booster,eta=eta)

    # fit the model to the training data
    model.fit(X_train, y_train)

    # predict the target values for the test data
    y_pred = model.predict(X_test)

    if verbose==True:
        print(f"Model Score: {y_pred}")
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names)
        disp.plot()
        

    metric_value = 0
    # calculate the metric of the model
    if metric == "accuracy":
        metric_value = accuracy_score(y_test, y_pred)
    if metric == "precision":
        metric_value = precision_score(y_test, y_pred)
    if metric == "recall":
        metric_value = recall_score(y_test, y_pred)
    if metric == "f1":
        metric_value = f1_score(y_test, y_pred)
    return metric_value

In [25]:
data,transform_data = get_data()
data.head()
data.groupby("State").count()

y = data['State']
X = data.drop(['State','Launched','Deadline'], axis=1)

In [None]:
data = pd.read_csv('data/kickstarter_projects.csv')


data = data.drop(["ID","Name"],axis=1)
data = data[(data["State"] == "Successful") | (data["State"] == "Failed")]
data["Deadline"] = pd.to_datetime(data["Deadline"],format='%Y-%m-%d')
data["Launched"] = pd.to_datetime(data["Launched"],format='%Y-%m-%d %H:%M:%S')
data["Duration"] = (data["Deadline"] - data["Launched"]).dt.days

data, transform_data = transform_strings_to_numerical(data)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1337)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

model = LogisticRegression(max_iter=1000,penalty="l2",C=0.1)
# fit the model to the training data
model.fit(X_train, y_train)

# predict the target values for the test data
y_pred = model.predict(X_test)

print(f"Model Coefficients: {model.coef_}")
print(f"Model Intercept: {model.intercept_}")
print(f"Model Score: {y_pred}")
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')

metric_value = f1_score(y_test, y_pred)
print(metric_value)

Model Coefficients: [[ 0.02520122  0.00417009  0.04143354 -0.10483769  0.10516741  0.08317194
   0.00119811]]
Model Intercept: [0.00554605]
Model Score: [0 0 0 ... 0 1 0]
Confusion Matrix: [[39351    56]
 [    0 26886]]
0.9989596492531768
