In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import LabelEncoder

ModuleNotFoundError: No module named 'matplotlib'

In [2]:
# Load the dataset from a CSV file located in the notebook's working directory.
data = pd.read_csv('./result.csv')

In [3]:
 # Tick labels handed to the confusion-matrix plots further down as `display_labels`.
 # NOTE(review): the visible cells never label-encode `outcome`, so the plots
 # presumably order classes by sorted label value -- confirm this list follows
 # that same order, otherwise the ticks are attached to the wrong classes.
 display_labels = ['hospitalized', 'nonhospitalized', 'recovered', 'deceased']

### Label Encoding

In [4]:
# Integer-encode the categorical feature columns of `data` in place.
# A separate LabelEncoder is fitted for each column and stored in
# `label_encoders`, so every column's mapping stays available afterwards
# (e.g. for `inverse_transform`). Refitting one shared encoder kept only the
# mapping of the last column.
label_encoders = {}
for column in ['sex', 'province', 'country', 'date_confirmation']:
    # `labelencoder` ends up bound to the `date_confirmation` encoder, which is
    # the same final state the single shared encoder used to have.
    labelencoder = LabelEncoder()
    data[column] = labelencoder.fit_transform(data[column])
    label_encoders[column] = labelencoder

### Split dataset with ratio 0.8 : 0.2

In [5]:
# Shuffle the rows once with a fixed seed so the 80/20 split is identical on
# every Restart & Run All, then cut the shuffled frame by position.
# Positional slicing replaces np.split(), which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
SPLIT_SEED = 42
shuffled_data = data.sample(frac=1, random_state=SPLIT_SEED)
n_train = int(0.8 * len(shuffled_data))
training_data = shuffled_data.iloc[:n_train]
validation_data = shuffled_data.iloc[n_train:]

# Every column except the target `outcome` is used as a feature.
x_columns = [x for x in training_data.columns if x != 'outcome']
X_train = training_data[x_columns]
X_validation = validation_data[x_columns]
y_train = training_data['outcome']
y_validation = validation_data['outcome']

### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Random forest with 100 trees, each limited to depth 30.
# random_state is fixed so bootstrap sampling and feature selection -- and
# therefore the reported scores -- are the same on every run.
rf = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=42)

In [8]:
# Fit the random forest on the training split; the fitted estimator is the cell output.
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=30)

In [9]:
# Predict outcomes for the validation features with the fitted random forest.
predict = rf.predict(X_validation)

In [10]:
# Accuracy of the random forest's predictions on the validation split.
accuracy_score(y_validation, predict)

0.8701220348279172

In [11]:
# Persist the fitted random forest to disk as a pickle.
with open('RandomForest.pkl', mode='wb') as model_file:
    pickle.dump(rf, model_file)

# Loading it back later works the same way in reverse:
# with open('filename.pkl', 'rb') as f:
#     clf = pickle.load(f)

### Evaluation of RF

In [12]:
# Reload the random forest from the pickle file written earlier in the notebook.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files from a
# trusted source.
rfModel = None
with open('./RandomForest.pkl', 'rb') as f:
    rfModel = pickle.load(f)

In [13]:
# Predict on the training features with the model reloaded from disk.
train_predict = rfModel.predict(X_train)

In [14]:
# Accuracy of the reloaded model on the training split.
accuracy_score(y_train, train_predict)

0.8865110379816262

In [15]:
# Evaluate the model reloaded from disk (`rfModel`), matching the training-set
# evaluation in this section, so the saved artifact itself is validated rather
# than the in-memory `rf` object.
valid_predict = rfModel.predict(X_validation)

In [16]:
# Accuracy of `valid_predict` against the validation labels.
accuracy_score(y_validation,valid_predict)

0.8701220348279172

### LightGBM 

In [17]:
# NOTE(review): not referenced by any of the visible cells -- confirm whether
# early stopping was meant to be passed to the LightGBM fits.
early_stop_rounds = 10


def print_all_score(y_true, y_pred):
    """Print accuracy, precision, recall, Cohen's kappa and F1 for predictions.

    Precision, recall and the overall F1 are support-weighted averages over
    the classes. F1 is additionally printed once per class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        None. All scores are written to stdout.
    """
    print('Accuracy score:', accuracy_score(y_true, y_pred))
    print('Precision score:', precision_score(y_true, y_pred, average = 'weighted'))
    print('Recall score:', recall_score(y_true, y_pred, average = 'weighted'))
    print('Kappa score:', cohen_kappa_score(y_true, y_pred))
    print('F1 score:', f1_score(y_true, y_pred, average = 'weighted'))
    # Per-class F1: pin the label order explicitly so that each score returned
    # by average=None can be paired with the class it belongs to.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    per_class_f1 = f1_score(y_true, y_pred, average = None, labels = labels)
    for label, score in zip(labels, per_class_f1):
        print('F1 score of', label, ':', score)

SyntaxError: invalid syntax (<ipython-input-17-bf5262481c6b>, line 7)

In [None]:
# Train one LightGBM classifier per boosting strategy (gbdt, dart, goss).
# The three models share every other hyperparameter.
shared_lgb_params = {'num_leaves': 200, 'n_estimators': 200}

gbdt_lgb = LGBMClassifier(boosting_type='gbdt', **shared_lgb_params).fit(X_train, y_train)
dart_lgb = LGBMClassifier(boosting_type='dart', **shared_lgb_params).fit(X_train, y_train)
goss_lgb = LGBMClassifier(boosting_type='goss', **shared_lgb_params).fit(X_train, y_train)

In [None]:
# Predict on the training and validation splits with each fitted model.
# The `*_test_predictions` names hold the validation-set predictions.
gbdt_train_predictions, gbdt_test_predictions = (
    gbdt_lgb.predict(X_train),
    gbdt_lgb.predict(X_validation),
)
dart_train_predictions, dart_test_predictions = (
    dart_lgb.predict(X_train),
    dart_lgb.predict(X_validation),
)
goss_train_predictions, goss_test_predictions = (
    goss_lgb.predict(X_train),
    goss_lgb.predict(X_validation),
)

In [None]:
# GBDT model: scores on the training split.
print_all_score(y_train, gbdt_train_predictions)

In [None]:
# GBDT model: scores on the validation split.
print_all_score(y_validation, gbdt_test_predictions)

In [None]:
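# To save a fitted LightGBM model, pickle the estimator object (e.g. gbdt_lgb),
# not `lgb`, which is the imported lightgbm module:
# with open('LGBM.pkl', 'wb') as f:
#     pickle.dump(gbdt_lgb, f)

# and later you can load it
# with open('LGBM.pkl', 'rb') as f:
#     clf = pickle.load(f)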

In [None]:
# DART model: scores on the training split.
print_all_score(y_train, dart_train_predictions)

In [None]:
# DART model: scores on the validation split.
print_all_score(y_validation, dart_test_predictions)

In [None]:
# GOSS model: scores on the training split.
print_all_score(y_train, goss_train_predictions)

In [None]:
# GOSS model: scores on the validation split.
print_all_score(y_validation, goss_test_predictions)

## Confusion Matrix for Training Set

In [None]:
# Confusion matrix of the GBDT LightGBM model on the training split.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator -- confirm the
# pinned scikit-learn version still provides it.
disp = plot_confusion_matrix(
    gbdt_lgb,
    X_train,
    y_train,
    display_labels=display_labels,
    xticks_rotation=10,
)
disp.ax_.set_title('LGB Confusion Matrix for Training Set')
plt.tight_layout()
# Uncomment to write the figure to disk:
# plt.savefig('plots/lgb_cm_train.png')

## Confusion Matrix for Validation Set

In [None]:
# Confusion matrix of the GBDT LightGBM model on the validation split.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator -- confirm the
# pinned scikit-learn version still provides it.
disp = plot_confusion_matrix(gbdt_lgb, X_validation, y_validation, display_labels = display_labels, xticks_rotation = 10)
# The title previously said "Training Set" (copy-paste from the training plot).
disp.ax_.set_title('LGB Confusion Matrix for Validation Set')
plt.tight_layout()
# Separate file name so saving this figure does not overwrite the training plot.
# plt.savefig('plots/lgb_cm_validation.png')