In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, roc_auc_score, accuracy_score, classification_report, average_precision_score, confusion_matrix, ConfusionMatrixDisplay

In [2]:
#Loading dataset
df = pd.read_csv('final_data.csv')
#Identify Columns
df.head()

Unnamed: 0,sex,patient_type,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res
0,2,1,2,27,2,2,2,2,2,2,2,2,2,2,2,2,1
1,2,1,2,24,2,2,2,2,2,2,2,2,2,2,2,1,1
2,1,2,2,54,2,2,2,2,2,2,2,2,1,2,2,2,1
3,2,2,1,30,2,2,2,2,2,2,2,2,2,2,2,2,1
4,1,2,2,60,2,1,2,2,2,1,2,1,2,2,2,1,1


In [3]:
df.shape

(496291, 17)

In [4]:
features=['sex', 'patient_type', 'pneumonia', 'age', 'pregnancy',
       'diabetes', 'copd', 'asthma', 'inmsupr', 'hypertension',
       'other_disease', 'cardiovascular', 'obesity', 'renal_chronic',
       'tobacco', 'contact_other_covid']

X = df[features]
y = df['covid_res']
X = X.values
y = np.array(y)

# Chia tập dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Size of X_train is:{}\nSize of Y_train is:{}\nSize of X_test is:{}\nSize of Y_test is:{}\n".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

Size of X_train is:(397032, 16)
Size of Y_train is:(397032,)
Size of X_test is:(99259, 16)
Size of Y_test is:(99259,)



In [5]:
#KNN
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Huấn luyện mô hình KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Dự đoán trên tập kiểm tra
y_pred = knn.predict(X_test_scaled)

# Đánh giá hiệu suất mô hình
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.5743459031422843
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.64      0.62     55380
           1       0.52      0.50      0.51     43879

    accuracy                           0.57     99259
   macro avg       0.57      0.57      0.57     99259
weighted avg       0.57      0.57      0.57     99259



In [6]:
#Random Forest
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

le = LabelEncoder()
for column in df.select_dtypes(include=['object', 'category']).columns:
    df[column] = le.fit_transform(df[column])

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.6234195387823774
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.84      0.71     55380
           1       0.63      0.35      0.45     43879

    accuracy                           0.62     99259
   macro avg       0.63      0.60      0.58     99259
weighted avg       0.63      0.62      0.60     99259



In [10]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
LR_model=LogisticRegression(C=0.03, solver='liblinear')
LR_model.fit(X_train,y_train)
y_pred = LR_model.predict(X_test)
y_pred_prob = LR_model.predict_proba(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.6290637288746135
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.84      0.72     55744
           1       0.64      0.36      0.46     44195

    accuracy                           0.63     99939
   macro avg       0.63      0.60      0.59     99939
weighted avg       0.63      0.63      0.60     99939



In [11]:
#Decision tree
from sklearn.tree import DecisionTreeClassifier
for i in range(1,25):
    DT_model = DecisionTreeClassifier(criterion="entropy", max_depth = i)
    DT_model.fit(X_train,y_train)
    y_pred = DT_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.6260618977576321
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.82      0.71     55744
           1       0.63      0.38      0.47     44195

    accuracy                           0.63     99939
   macro avg       0.63      0.60      0.59     99939
weighted avg       0.63      0.63      0.61     99939



In [None]:
#svm
from sklearn import svm
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(X_train, y_train) 
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

In [7]:
#Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

gbk = GradientBoostingClassifier(random_state=100, n_estimators=150,min_samples_split=100, max_depth=6)
gbk.fit(X_train, y_train)
y_pred = gbk.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.6343807614422874
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.86      0.72     55380
           1       0.67      0.35      0.46     43879

    accuracy                           0.63     99259
   macro avg       0.65      0.60      0.59     99259
weighted avg       0.64      0.63      0.61     99259



In [8]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, y_train)
y.pred=model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

ModuleNotFoundError: No module named 'xgboost'