In [64]:
import util
from util import load_config, pickle_dump, pickle_load
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [34]:
def load_dataset(config: dict):
    """Load the pickled train, validation and test sets listed in ``config``.

    For each of the keys ``"train_clean_set_path"``, ``"valid_clean_set_path"``
    and ``"test_clean_set_path"`` the config value is indexed with ``[0]`` and
    ``[1]``; both entries are passed to ``pickle_load`` in that order.
    Presumably ``[0]`` is the feature file and ``[1]`` the target file for the
    split (inferred from the x_/y_ naming) — confirm against the config file.

    Parameters
    ----------
    config : dict
        Mapping that contains the three ``*_clean_set_path`` keys above.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``.
    """
    split_keys = (
        "train_clean_set_path",
        "valid_clean_set_path",
        "test_clean_set_path",
    )

    loaded = []
    for split_key in split_keys:
        split_paths = config[split_key]
        # Entry 0 first, then entry 1, so the flat result keeps x/y pairs adjacent.
        loaded.append(pickle_load(split_paths[0]))
        loaded.append(pickle_load(split_paths[1]))

    return tuple(loaded)

In [54]:
def train_model(x_train, y_train, x_valid, y_valid, x_test, y_test):
    """Fit a random forest on the training data and print two evaluation reports.

    A ``RandomForestClassifier(random_state=123)`` is fitted on
    ``(x_train, y_train)``. The fitted model then predicts on ``x_valid`` and
    on ``x_test``; after each prediction the ``classification_report`` against
    the matching labels is printed (validation first, test second).

    Returns
    -------
    RandomForestClassifier
        The fitted estimator.
    """
    model = RandomForestClassifier(random_state=123)
    model.fit(x_train, y_train)

    # Validation report is printed before the test report.
    evaluation_sets = ((x_valid, y_valid), (x_test, y_test))
    for features, labels in evaluation_sets:
        predictions = model.predict(features)
        print(classification_report(labels, predictions))

    return model

In [65]:
# Logistic-regression baseline using the liblinear solver and a fixed seed.
logreg = LogisticRegression(random_state=123, solver="liblinear")

In [66]:
# Fit the logistic-regression baseline on x_train / y_train.
# NOTE(review): x_train and y_train are only assigned by the
# load_dataset(config) cell further down, so on a fresh kernel run
# top-to-bottom this cell raises NameError — move the data-loading cells
# above this one.
logreg.fit(x_train, y_train)

LogisticRegression(random_state=123, solver='liblinear')

In [67]:
# Classification report for the fitted logistic regression on the validation split.
y_pred_valid_logreg = logreg.predict(X=x_valid)
print(classification_report(y_true=y_valid, y_pred=y_pred_valid_logreg))

              precision    recall  f1-score   support

           0       0.54      0.03      0.05      2544
           1       0.54      0.98      0.70      2997

    accuracy                           0.54      5541
   macro avg       0.54      0.50      0.38      5541
weighted avg       0.54      0.54      0.40      5541



In [69]:
# Classification report for the fitted logistic regression on the test split.
y_pred_test_logreg = logreg.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_pred_test_logreg))

              precision    recall  f1-score   support

           0       0.52      0.03      0.05      2544
           1       0.54      0.98      0.70      2999

    accuracy                           0.54      5543
   macro avg       0.53      0.50      0.38      5543
weighted avg       0.53      0.54      0.40      5543



In [59]:
# Classification report for the random forest on the validation split.
# NOTE(review): random_forest is assigned by the train_model(...) cell further
# down, and the report saved under this cell differs from the validation
# report that cell printed — the two outputs appear to come from different
# kernel state; re-run top-to-bottom to confirm which one is current.
y_pred_valid = random_forest.predict(X=x_valid)
print(classification_report(y_true=y_valid, y_pred=y_pred_valid))

              precision    recall  f1-score   support

           0       0.46      0.33      0.39      2544
           1       0.54      0.67      0.60      2997

    accuracy                           0.51      5541
   macro avg       0.50      0.50      0.49      5541
weighted avg       0.50      0.51      0.50      5541



In [62]:
# Classification report for the random forest on the test split.
# NOTE(review): random_forest is assigned by the train_model(...) cell further
# down; the report saved under this cell does not match the test report that
# cell printed, so verify the numbers with a clean top-to-bottom run.
y_pred_test = random_forest.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_pred_test))

              precision    recall  f1-score   support

           0       0.47      0.34      0.39      2544
           1       0.55      0.68      0.61      2999

    accuracy                           0.52      5543
   macro avg       0.51      0.51      0.50      5543
weighted avg       0.51      0.52      0.51      5543



In [36]:
# Read the project configuration; load_config is the same util function,
# imported by name in the imports cell.
config = load_config()

In [37]:
# Unpack the six objects in the order load_dataset returns them.
(
    x_train, y_train,
    x_valid, y_valid,
    x_test, y_test,
) = load_dataset(config)

In [38]:
# Preview the first five rows of the training features.
x_train.head(n=5)

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,-1.206558,-0.512906,0.268487,-0.997489,0.997489,-0.721973,-0.41431,1.270711,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
1,0.116142,1.698067,-2.474832,-0.997489,0.997489,-0.721973,2.413653,-0.786961,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,2.202762,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
2,-0.198787,-0.14441,-0.686999,1.002518,-1.002518,1.385093,-0.41431,-0.786961,-0.383816,-0.509485,...,2.636029,-0.451308,2.211524,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
3,1.186899,0.961076,0.144788,-0.997489,0.997489,-0.721973,-0.41431,-0.786961,2.605415,1.962766,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
4,-0.89163,-0.512906,-0.108614,1.002518,-1.002518,-0.721973,-0.41431,-0.786961,2.605415,1.962766,...,-0.379359,-0.451308,2.211524,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511


In [39]:
# Shapes of every split, in x/y order for train, valid and test.
tuple(
    split.shape
    for split in (x_train, y_train, x_valid, y_valid, x_test, y_test)
)

((25849, 49), (25849,), (5541, 49), (5541,), (5543, 49), (5543,))

In [40]:
# Train the random forest; train_model prints the validation report and then
# the test report before returning the fitted estimator.
random_forest = train_model(
    x_train, y_train,
    x_valid, y_valid,
    x_test, y_test,
)

              precision    recall  f1-score   support

           0       1.00      0.00      0.00      2544
           1       0.54      1.00      0.70      2997

    accuracy                           0.54      5541
   macro avg       0.77      0.50      0.35      5541
weighted avg       0.75      0.54      0.38      5541

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2544
           1       0.54      1.00      0.70      2999

    accuracy                           0.54      5543
   macro avg       0.27      0.50      0.35      5543
weighted avg       0.29      0.54      0.38      5543



In [31]:
# Persist the random forest to the path stored under "production_model_path".
# NOTE(review): this cell's execution count (31) is lower than the training
# cell's (40), so the file on disk may hold an earlier model — re-run this
# cell after training to be sure.
pickle_dump(random_forest, config["production_model_path"])

['../models/production_model.pkl']