In [1]:
import pandas as pd 
import numpy as np 
import json
import re
from functools import reduce

In [113]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
from xgboost import XGBClassifier

In [10]:
from lightgbm import LGBMClassifier

In [11]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [74]:
# Load the train/test tables; paths are relative to the notebook's working directory.
train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")



In [75]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,ID,Prediction
0,user0x410,1
1,user0x432,1
2,user0x16a3,1
3,user0x1ad1,1
4,user0x174d,0


In [76]:
def get_temp(text):
    """Return the first decimal number found in ``text``.

    Parameters
    ----------
    text : str or None
        Free text expected to contain a reading such as ``"98.6"``.

    Returns
    -------
    str or None
        The first ``digits.digits`` match, as a string, or ``None`` when
        ``text`` is not a string or contains no decimal number.
        Integer-only values such as ``"98"`` do not match.
    """
    # A missing answer (None / NaN) cannot carry a reading; avoid a TypeError.
    if not isinstance(text, str):
        return None
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    match = re.search(r"\d+\.\d+", text)
    return match.group(0) if match else None

In [77]:
def get_data(filename, conv_dir="dataset/trainConversations/"):
    """Parse one conversation JSON file into five feature fields.

    The file ``conv_dir + filename + ".json"`` is loaded as a dict and read
    positionally, in the order its keys appear. Three layouts are handled:

    1. The 4th key is the symptom question: ``text`` is value 0 + key 1 +
       value 1, ``temp`` is parsed from value 2 and ``tr_type`` is value 4;
       ``symp`` and ``loc`` are left as ``None``.
    2. There is no ``"Share your Current location? "`` key: ``text`` is
       value 0 + key 1, while ``temp``, ``symp`` and ``tr_type`` are taken
       from keys 2, 3 and 4; ``loc`` is ``None``.
    3. Otherwise ``text``, ``temp``, ``symp``, ``tr_type`` and ``loc`` come
       from values 0 to 4.

    Parameters
    ----------
    filename : str
        File stem of the conversation (the value of the ``ID`` column).
    conv_dir : str, optional
        Directory prefix, including its trailing separator, that is
        concatenated in front of ``filename``.

    Returns
    -------
    pandas.Series
        ``[text, temp, symp, tr_type, loc]``. ``temp`` is whatever
        ``get_temp`` returns; ``loc`` is the list of decimal-number strings
        found in the location answer, or ``None``.

    Raises
    ------
    IndexError
        If the conversation has fewer entries than the matched layout reads.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(conv_dir + filename + ".json") as fh:
        b = json.load(fh)

    # Materialise the positional views once instead of once per field.
    keys = list(b.keys())
    values = list(b.values())

    if 'What Symptoms do you have? ' == keys[3]:
        text = values[0] + keys[1] + values[1]
        temp = get_temp(values[2])
        # NOTE(review): keys[3] is the symptom question, so values[3] presumably
        # holds its answer, yet symp stays None here - confirm this is intended.
        symp = None
        tr_type = values[4]
        loc = None
    elif "Share your Current location? " not in b:
        # In this layout the fields are read from the keys, not the values.
        text = values[0] + keys[1]
        temp = get_temp(keys[2])
        symp = keys[3]
        tr_type = keys[4]
        loc = None
    else:
        text = values[0]
        temp = get_temp(values[1])
        symp = values[2]
        tr_type = values[3]
        # Raw string avoids the invalid "\d" escape of a plain literal.
        loc = re.findall(r"\d+\.\d+", values[4])

    return pd.Series([text, temp, symp, tr_type, loc])

In [78]:
def get_data1(filename, conv_dir="dataset/testConversations/"):
    """Parse one test-set conversation JSON file into five feature fields.

    The file ``conv_dir + filename + ".json"`` is loaded as a dict and read
    positionally, in the order its keys appear. Three layouts are handled:

    1. The 4th key is the symptom question: ``text`` is value 0 + key 1 +
       value 1, ``temp`` is parsed from value 2 and ``tr_type`` is value 4;
       ``symp`` and ``loc`` are left as ``None``.
    2. There is no ``"Share your Current location? "`` key: ``text`` is
       value 0 + key 1, while ``temp``, ``symp`` and ``tr_type`` are taken
       from keys 2, 3 and 4; ``loc`` is ``None``.
    3. Otherwise ``text``, ``temp``, ``symp``, ``tr_type`` and ``loc`` come
       from values 0 to 4.

    Parameters
    ----------
    filename : str
        File stem of the conversation (the value of the ``ID`` column).
    conv_dir : str, optional
        Directory prefix, including its trailing separator, that is
        concatenated in front of ``filename``.

    Returns
    -------
    pandas.Series
        ``[text, temp, symp, tr_type, loc]``. ``temp`` is whatever
        ``get_temp`` returns; ``loc`` is the list of decimal-number strings
        found in the location answer, or ``None``.

    Raises
    ------
    IndexError
        If the conversation has fewer entries than the matched layout reads.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(conv_dir + filename + ".json") as fh:
        b = json.load(fh)

    # Materialise the positional views once instead of once per field.
    keys = list(b.keys())
    values = list(b.values())

    if 'What Symptoms do you have? ' == keys[3]:
        text = values[0] + keys[1] + values[1]
        temp = get_temp(values[2])
        # NOTE(review): keys[3] is the symptom question, so values[3] presumably
        # holds its answer, yet symp stays None here - confirm this is intended.
        symp = None
        tr_type = values[4]
        loc = None
    elif "Share your Current location? " not in b:
        # In this layout the fields are read from the keys, not the values.
        text = values[0] + keys[1]
        temp = get_temp(keys[2])
        symp = keys[3]
        tr_type = keys[4]
        loc = None
    else:
        text = values[0]
        temp = get_temp(values[1])
        symp = values[2]
        tr_type = values[3]
        # Raw string avoids the invalid "\d" escape of a plain literal.
        loc = re.findall(r"\d+\.\d+", values[4])

    return pd.Series([text, temp, symp, tr_type, loc])

In [99]:
# Expand every conversation ID into five feature columns by parsing its JSON
# file: get_data reads the train folder, get_data1 the test folder.
train[['text','temp','symp','tr_type','location']] = train.ID.apply(get_data)
test[['text','temp','symp','tr_type','location']] = test.ID.apply(get_data1)

In [100]:
# Spot-check the extracted feature columns on the last training rows.
train.tail()

Unnamed: 0,ID,Prediction,text,temp,symp,tr_type,location
4895,user0x389,0,What started the virus? Search for Covid19 cau...,98.42,Nothing,type_2,"[13.73, 93.87]"
4896,user0x1448,0,Is there a flu vaccine for the virus How did c...,96.72,Nothing,type_1,"[17.62, 76.96]"
4897,user0xf8c,0,I am wondering do I have Corona if my body hur...,98.05,Nothing,type_1,"[14.84, 78.09]"
4898,user0xeb,1,Do you know if fevers are symptoms of Covid-19...,99.72,loss_of_smell,type_2,"[20.4, 91.62]"
4899,user0x1425,0,Can you show me how is the virus spreading? Wh...,97.36,,type_1,


In [101]:
# Spot-check the extracted feature columns on the last test rows.
test.tail()

Unnamed: 0,ID,text,temp,symp,tr_type,location
2095,user0x1855,Is there a treatment for the symptoms of covid...,98.17,tiredness,type_3,"[18.73, 72.45]"
2096,user0x17a8,I feel more tired than usual. What is the orig...,101.91,"tiredness', 'sore_throat",type_2,"[19.84, 90.49]"
2097,user0xac2,What do i need to know about the emergence of ...,100.74,loss_of_taste,type_3,"[10.96, 91.62]"
2098,user0x1648,Can I take aspirin if I have corona virus how ...,106.17,"loss_of_taste', 'tiredness",type_1,"[14.84, 90.49]"
2099,user0x62b,Is it safe to use ibuprofen if I have corona v...,96.92,loss_of_taste,type_3,"[13.73, 91.62]"


In [102]:
# get_temp yields strings (or None); cast so the temperature column is numeric.
train['temp'] = train['temp'].astype(float)
test['temp'] = test['temp'].astype(float)

In [112]:
# Row/column count of the training frame after feature extraction.
train.shape

(4900, 7)

In [107]:
# Missing values per column in the training frame.
train.isna().sum()

ID             0
Prediction     0
text           0
temp           0
symp          60
tr_type        0
location      67
dtype: int64

In [108]:
# Missing values per column in the test frame.
test.isna().sum()

ID           0
text         0
temp         1
symp        33
tr_type      0
location    40
dtype: int64

In [109]:
def clean_text(text):
    """Lower-case ``text`` and keep only ``a-z``, spaces and commas.

    Underscores, quotes, digits and every other character are removed, so
    ``"loss_of_smell"`` becomes the single token ``"lossofsmell"``.

    Parameters
    ----------
    text : str or None
        Raw text; anything that is not a non-empty string counts as missing.

    Returns
    -------
    str or None
        The cleaned string, or ``None`` for missing/empty input.
    """
    # NaN is a truthy float, so a bare truthiness test would let it through
    # to ``.lower()`` and raise AttributeError; check the type explicitly.
    if not isinstance(text, str) or not text:
        return None
    return re.sub(r"[^a-z ,]", "", text.lower())

In [110]:
# Normalise the symptom strings of both frames with clean_text.
train['symp'] = train['symp'].apply(clean_text)
test['symp'] = test['symp'].apply(clean_text)

In [111]:
# train.symp.fillna("",inplace=True)
# test.symp.fillna("",inplace=True)

### Predict the label from symptoms and temperature

In [91]:
# Text vectorizers with default settings.
# NOTE(review): `tfidf` is not referenced in any cell visible here - confirm it is still needed.
tfidf = TfidfVectorizer()
cnt = CountVectorizer()

In [92]:
# `symp` still holds missing values (the in-place fillna cell is commented
# out), and CountVectorizer only accepts strings. Fill with "" here, which
# contributes no tokens, so a fresh Restart & Run All does not fail and the
# frames themselves stay unmodified.
# The vocabulary is fitted on train only and reused for test.
symp_vec = cnt.fit_transform(train.symp.fillna("")).toarray()
test_symp_vec = cnt.transform(test.symp.fillna("")).toarray()

In [93]:
# Training features: symptom token counts with temperature appended as the last column.
train_vec = np.concatenate((symp_vec, train.temp.values[:, None]), axis=1)

In [94]:
# Test features: same layout as train_vec (token counts, then temperature).
test_vec = np.concatenate((test_symp_vec, test.temp.values[:, None]), axis=1)

In [95]:
# Random forest with balanced class weights and 500 trees.
# NOTE(review): `mdl` is not referenced in any cell visible here and sets no
# random_state - confirm whether it is still needed.
mdl = RandomForestClassifier(class_weight="balanced",n_estimators=500)

In [96]:
# Gradient-boosting classifiers with default hyper-parameters.
# NOTE(review): `lgb` is not referenced in any cell visible here - confirm it is still needed.
xgb = XGBClassifier()
lgb= LGBMClassifier()

In [97]:
# Standardise the features, then classify with the XGBoost model.
xgb_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('mdl', xgb),
])

In [98]:
# 3-fold cross-validated F1 score of the pipeline on the training features.
cross_val_score(xgb_pipe,train_vec,train.Prediction,cv=3,scoring='f1')

array([0.96075778, 0.96098563, 0.95759234])

In [53]:
# Fit the pipeline on the full training set.
xgb_pipe.fit(train_vec,train.Prediction)

Pipeline(steps=[('scaler', StandardScaler()),
                ('mdl',
                 XGBClassifier(base_score=0.5, booster=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints=None,
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method=None,
                               validate_parameters=False, verbosity=None))])

In [59]:
# Predicted labels for the test feature matrix.
yhat  = xgb_pipe.predict(test_vec)

In [60]:
# Empty frame that will hold the submission columns.
res = pd.DataFrame()

In [61]:
# Submission columns: the test IDs and the predicted label for each.
res['ID'] = test.ID
res['Prediction'] = yhat

In [62]:
# Write the predictions to CSV without the index column.
res.to_csv("XGBoost_base.csv",index=False)