In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/kickstarter_projects.csv')

data = data.drop(["ID","Name"],axis=1)
data = data[(data["State"] == "Successful") | (data["State"] == "Failed")]
data["Deadline"] = pd.to_datetime(data["Deadline"],format='%Y-%m-%d')
data["Launched"] = pd.to_datetime(data["Launched"],format='%Y-%m-%d %H:%M:%S')
data["Duration"] = (data["Deadline"] - data["Launched"]).dt.days

for column in data.columns:
        # If data type is an object, for example a string, we want to convert the column to numerical values
        if data[column].dtype == 'object' and data[column].dtype != 'datetime64[ns]':
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])

#data, transform_data = transform_strings_to_numerical(data)

In [10]:
y = data['State']
X = data.drop(['State','Launched','Deadline'], axis=1)

In [11]:
X.head(10)

Unnamed: 0,Category,Subcategory,Country,Goal,Pledged,Backers,Duration
0,5,52,21,1000,625,30,39
1,6,129,21,80000,22,3,87
2,0,70,21,20,35,3,8
3,13,131,21,99,145,25,79
4,5,52,21,1900,387,10,28
5,9,77,21,3000,3329,110,17
6,6,129,21,200,41,3,29
7,12,54,21,500,563,18,29
9,10,125,21,300,15,2,16
10,11,104,21,350,1630,31,48


In [12]:
y.head(10)

0     0
1     0
2     1
3     1
4     0
5     1
6     0
7     1
9     0
10    1
Name: State, dtype: int32

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((265169, 7), (66293, 7), (265169,), (66293,))

In [19]:
X_train.head()

Unnamed: 0,Category,Subcategory,Country,Goal,Pledged,Backers,Duration
322439,10,125,21,1500,1825,39,34
334349,4,37,21,1100,6027,419,6
219576,4,113,0,116629,1622,9,59
128525,12,95,21,5000,5050,31,29
13392,12,95,21,5000,0,0,44


In [20]:
X_test.head()

Unnamed: 0,Category,Subcategory,Country,Goal,Pledged,Backers,Duration
214728,12,19,13,5582,1743,27,39
224796,4,65,21,700,60,2,39
158299,13,131,21,800,57,4,29
329487,10,47,21,8000,9179,108,27
162625,0,98,21,25000,51,3,44


In [21]:


model = LogisticRegression(max_iter=1000,penalty="l2",C=0.1)
# fit the model to the training data
model.fit(X_train, y_train)

# predict the target values for the test data
y_pred = model.predict(X_test)

In [22]:
print(f"Model Coefficients: {model.coef_}")
print(f"Model Intercept: {model.intercept_}")
print(f"Model Score: {y_pred}")
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')

metric_value = f1_score(y_test, y_pred)
print(metric_value)

Model Coefficients: [[ 0.01684149  0.0032329   0.03237224 -0.1055174   0.10587912  0.07303369
  -0.00594841]]
Model Intercept: [0.58553995]
Model Score: [0 0 0 ... 1 0 0]
Confusion Matrix: [[39342    76]
 [    0 26875]]
0.9985880429532197
