In [43]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from category_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

In [2]:
file_path = "D:\\Lambda_School\\unit_4\\bw_project\\ks-projects-201801.csv"

In [32]:
def wrangle(file_path):
    df = pd.read_csv(file_path)
    df = df.dropna()
    columns = ['main_category', 'launched', 'country', 'usd_pledged_real', 'state']
    df = df[columns]
    df['launched'] = df['launched'].str.split('-').apply(lambda x: x[1]).astype(int)
    df = df.drop(df[df['state'] == 'live'].index)

    mask = df['state'] == 'successful'
    df.loc[mask, 'target'] = 1
    df.loc[~mask, 'target'] = 0
    df['target'] = df['target'].astype(int)
    df = df.drop(columns=['state'])

    return df

In [33]:
df = wrangle(file_path)
df.head()

Unnamed: 0,main_category,launched,country,usd_pledged_real,target
0,Publishing,8,GB,0.0,0
1,Film & Video,9,US,2421.0,0
2,Film & Video,1,US,220.0,0
3,Music,3,US,1.0,0
4,Film & Video,7,US,1283.0,0


In [35]:
df['target'].value_counts(normalize=True)

0    0.640245
1    0.359755
Name: target, dtype: float64

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372062 entries, 0 to 378660
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   main_category     372062 non-null  object 
 1   launched          372062 non-null  int32  
 2   country           372062 non-null  object 
 3   usd_pledged_real  372062 non-null  float64
 4   target            372062 non-null  int32  
dtypes: float64(1), int32(2), object(2)
memory usage: 14.2+ MB


In [39]:
X = df.drop(columns=['target'])
y = df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [45]:
# encoder = OrdinalEncoder()
# encoder.fit_transform(X_train)

In [47]:
lr = LogisticRegression(n_jobs=-2, random_state=42)
encoder = OrdinalEncoder()
model_lr = Pipeline([('encoder', encoder), ('clf', lr)])
model_lr.fit(X_train, y_train);

In [48]:
lr_training_accuracy = model_lr.score(X_train, y_train)
lr_val_accuracy = model_lr.score(X_val, y_val)

print('Training Accuracy Score:', lr_training_accuracy)
print('Validation Accuracy Score:', lr_val_accuracy)

Training Accuracy Score: 0.7627809051679085
Validation Accuracy Score: 0.7655279920445077


In [49]:
rf = RandomForestClassifier(n_jobs=-2, random_state=42, verbose=1)
encoder = OrdinalEncoder()
model_rf = Pipeline([('encoder', encoder), ('clf', rf)])
model_rf.fit(X_train, y_train);

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    9.4s finished


In [50]:
rf_training_accuracy = model_rf.score(X_train, y_train)
rf_val_accuracy = model_rf.score(X_val, y_val)

print('Training Accuracy Score:', rf_training_accuracy)
print('Validation Accuracy Score:', rf_val_accuracy)

[Parallel(n_jobs=11)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:    0.6s
[Parallel(n_jobs=11)]: Done 100 out of 100 | elapsed:    1.9s finished
[Parallel(n_jobs=11)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:    0.0s


Training Accuracy Score: 0.9801496169984023
Validation Accuracy Score: 0.8123740156422179


[Parallel(n_jobs=11)]: Done 100 out of 100 | elapsed:    0.1s finished


In [51]:
xgb = XGBClassifier(n_jobs=-2, random_state=42, verbose=1)
encoder = OrdinalEncoder()
model_xgb = Pipeline([('encoder', encoder), ('clf', xgb)])
model_xgb.fit(X_train, y_train);



Parameters: { "verbose" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [52]:
xgb_training_accuracy = model_xgb.score(X_train, y_train)
xgb_val_accuracy = model_xgb.score(X_val, y_val)

print('Training Accuracy Score:', xgb_training_accuracy)
print('Validation Accuracy Score:', xgb_val_accuracy)

Training Accuracy Score: 0.8566543727882218
Validation Accuracy Score: 0.8521514768726315


In [53]:
# Since the XGBoost model produced a higher validation score without tunning
# The hyperparamter tunning step will be focus on the XGB model

In [55]:
clf = XGBClassifier()
encoder = OrdinalEncoder()
model_pipe = Pipeline(
    [
        ('encoder', encoder), ('clf', clf)
    ]
)
parameters = {
    "clf__eta": [1e-1, 1e-2, 1e-3, 1e-4], 
    "clf__max_depth": [3, 6, 9], 
    "clf__sampling_method": ['uniform', 'gradient_based'], 
}
xgb_gs = GridSearchCV(model_pipe, parameters, n_jobs=-2, cv = 5, verbose = 1)

In [56]:
xgb_gs.fit(X, y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits






GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('encoder', OrdinalEncoder()),
                                       ('clf',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                 

In [57]:
xgb_gs.best_score_

0.8520488550356278

In [59]:
xgb_gs.best_params_

{'clf__eta': 0.1, 'clf__max_depth': 9, 'clf__sampling_method': 'uniform'}

In [60]:
# Based on the result, change the param frid and rerun the search
parameters = {
    "clf__eta": [0.1, 0.2, 0.3], 
    "clf__max_depth": [10, 15, 20]
}
xgb_gs = GridSearchCV(model_pipe, parameters, n_jobs=-2, cv = 5, verbose = 1)

In [61]:
xgb_gs.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits






GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('encoder', OrdinalEncoder()),
                                       ('clf',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                 

In [62]:
xgb_gs.best_score_

0.8518580256356344

In [63]:
xgb_gs.best_params_

{'clf__eta': 0.1, 'clf__max_depth': 10}

In [111]:
# test_df = pd.DataFrame({
#     'main_category': ['Games'], 
#     'launched': [12], 
#     'country': ['CA'], 
#     'usd_pledged_real': [4730.0]
# })
# xgb_gs.predict(test_df)

array([1])

In [110]:
# import joblib
# joblib.dump(xgb_gs, 'prediction_model.pkl')

['prediction_model.pkl']