In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv('corona_tested_individuals_ver_0083.english.csv')
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,2020-11-12,0,0,0,0,0,negative,No,male,Other
1,2020-11-12,0,1,0,0,0,negative,No,male,Other
2,2020-11-12,0,0,0,0,0,negative,Yes,female,Other
3,2020-11-12,0,0,0,0,0,negative,No,male,Other
4,2020-11-12,0,1,0,0,0,negative,No,male,Contact with confirmed


In [2]:
df.shape

(2742596, 10)

In [3]:
df['test_indication'].value_counts()

Other                     2547559
Contact with confirmed     170742
Abroad                      24295
Name: test_indication, dtype: int64

In [4]:
gender_mapping = {'female': 0, 'male': 1}
df['gender'] = df['gender'].map(gender_mapping)

In [5]:
corona_result_mapping = {'positive': 1, 'negative': 0}
df['corona_result'] = df['corona_result'].map(corona_result_mapping)

In [6]:
age_60_and_above_mapping = {'Yes': 1, 'No': 0}
df['age_60_and_above'] = df['age_60_and_above'].map(age_60_and_above_mapping)

In [7]:
df['Contact_with_confirmed'] = df['test_indication'].apply(lambda x: 1 if x == 'Contact with confirmed' else 0)

In [8]:
print('Minimum Date',df['test_date'].min())
print('Maximum  Date',df['test_date'].max())

Minimum Date 2020-03-11
Maximum  Date 2020-11-12


In [9]:
df = df[df['test_date'] > '2020-09-01']

In [10]:
df.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication,Contact_with_confirmed
0,2020-11-12,0,0,0,0,0,0.0,0.0,1.0,Other,0
1,2020-11-12,0,1,0,0,0,0.0,0.0,1.0,Other,0
2,2020-11-12,0,0,0,0,0,0.0,1.0,0.0,Other,0
3,2020-11-12,0,0,0,0,0,0.0,0.0,1.0,Other,0
4,2020-11-12,0,1,0,0,0,0.0,0.0,1.0,Contact with confirmed,1


In [11]:
from sklearn.model_selection import train_test_split
df.sort_values(by='test_date', inplace=True)
test_size = 0.3
train_df, test_df = train_test_split(df, test_size=test_size, shuffle=False)
print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)


Training set shape: (855513, 11)
Testing set shape: (366649, 11)


In [12]:
train_df = train_df.drop(columns=['test_date','test_indication'])
test_df = test_df.drop(columns=['test_date','test_indication'])

In [13]:
train_df = train_df.dropna()

test_df = test_df.dropna()


In [14]:
X_train = train_df.drop(columns=['corona_result'])
y_train = train_df['corona_result']

X_test = test_df.drop(columns=['corona_result'])
y_test = test_df['corona_result']

rf_classifier = RandomForestClassifier()

rf_param_grid = {
    'n_estimators': [25, 50, 75],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


lr_classifier = LogisticRegression()

lr_param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
}

nb_classifier = GaussianNB()


rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)

lr_grid_search = GridSearchCV(lr_classifier, lr_param_grid, cv=5, scoring='accuracy')
lr_grid_search.fit(X_train, y_train)

nb_classifier.fit(X_train, y_train)

rf_predictions = rf_grid_search.predict(X_test)
lr_predictions = lr_grid_search.predict(X_test)
nb_predictions = nb_classifier.predict(X_test)

# Print accuracy and classification report for each model
print("Random Forest Model:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

print("\nLogistic Regression Model:")
print("Accuracy:", accuracy_score(y_test, lr_predictions))
print("Classification Report:\n", classification_report(y_test, lr_predictions))

print("\nNaive Bayes Model:")
print("Accuracy:", accuracy_score(y_test, nb_predictions))
print("Classification Report:\n", classification_report(y_test, nb_predictions))


Traceback (most recent call last):
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.

Traceback (most recent call last):
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.

Traceback (most recent call last):
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.

Traceback (most recent call last):
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\goddu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.89145339        nan 0.89145339        nan 0.89145339]


Random Forest Model:
Accuracy: 0.9593876006117814
Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.97      0.98    311872
         1.0       0.46      0.69      0.55     11773

    accuracy                           0.96    323645
   macro avg       0.72      0.83      0.77    323645
weighted avg       0.97      0.96      0.96    323645


Logistic Regression Model:
Accuracy: 0.9657371502726753
Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98    311872
         1.0       0.53      0.48      0.50     11773

    accuracy                           0.97    323645
   macro avg       0.76      0.73      0.74    323645
weighted avg       0.96      0.97      0.96    323645


Naive Bayes Model:
Accuracy: 0.9520338642648581
Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.96      0.97    311872
         