# Classification with a Tabular Vector Borne Disease Dataset

[Follow Here](https://www.kaggle.com/competitions/playground-series-s3e13/overview)

In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Explore Dataset

### Read Dataset 

In [None]:
data_path = './data/input/'
train_df = pd.read_csv(data_path + 'train.csv', index_col=0)
test_df = pd.read_csv(data_path + 'test.csv', index_col=0)

### Investigate Datatype

*All columns are of numerical datatype and except the target column prognosis. prognosis column should be encoded to numerical value.*

In [None]:
for column, t in zip(train_df.columns, train_df.dtypes):
    print(column, t)

In [None]:
for column, t in zip(test_df.columns, test_df.dtypes):
    print(column, t)

### Check for missing values

*No missing values*

In [None]:
for column, count in zip(train_df.columns, train_df.isna().sum()):
    if count > 0:
        print(f"{column}, {count}")

In [None]:
for column, count in zip(test_df.columns, test_df.isna().sum()):
    if count > 0:
        print(f"{column}, {count}")

### Summary Satistics


**Target class:** *Almost equally Distributed*

**Normalization:** *Features values are normalized as part of input dataset*

In [None]:
count = train_df['prognosis'].value_counts()
cols = count.index

target_class_ratio = pd.DataFrame(data = {
    'count': count , 
    'ratio' : round(count/len(train_df), 2)
    })

target_class_ratio

## Train Test Split and Encode Label Values

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = train_df.iloc[:, :-1]
y = train_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# create label encoder and fit on train data
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

print(f"Labels: {label_encoder.classes_}")

# encode labels
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)




X_train = np.concatenate([X_train.astype('int').values, X_test.astype('int').values])
y_train = np.concatenate([y_train, y_test])

Labels: ['Chikungunya' 'Dengue' 'Japanese_encephalitis' 'Lyme_disease' 'Malaria'
 'Plague' 'Rift_Valley_fever' 'Tungiasis' 'West_Nile_fever' 'Yellow_Fever'
 'Zika']


## Plot

### Distributions and Outliers

In [74]:
# X_train.iloc[:, :10].plot.density() # bimodal (2 peaks)

## Model Training and Evaluation

In [75]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import average_precision_score

lr = GradientBoostingClassifier(n_estimators=60, learning_rate=0.01, validation_fraction=0.2)

lr.fit(X_train, y_train)

lr.predict_proba(X_train)


array([[0.06379669, 0.05392881, 0.14427276, ..., 0.16941133, 0.09180573,
        0.04783999],
       [0.09465061, 0.15400003, 0.11380409, ..., 0.12249598, 0.06643938,
        0.07156207],
       [0.09986951, 0.06366361, 0.13739016, ..., 0.16754708, 0.06779492,
        0.10065499],
       ...,
       [0.08427032, 0.16166076, 0.08366949, ..., 0.19848027, 0.05559978,
        0.04711189],
       [0.04783794, 0.12309954, 0.07998544, ..., 0.06689857, 0.0455477 ,
        0.04094485],
       [0.048749  , 0.04262374, 0.08006004, ..., 0.14178351, 0.10332326,
        0.10570539]])

In [76]:
def get_labels(model, X):
    y_prob = model.predict_proba(X)
    y_prob_top_3 = y_prob.argsort()[:,:3]
    result = label_encoder.inverse_transform(y_prob_top_3.reshape(-1, 1)).reshape(-1, 3)
    df = pd.DataFrame(result)
    df['predictions'] = df.iloc[:, 0] + ' ' + df.iloc[:, 1] + ' ' + df.iloc[:, 2]
    return df['predictions']

# lbs = get_labels(model, X_test)

In [77]:
test_out = get_labels(lr, test_df)

  y = column_or_1d(y, warn=True)


In [78]:
test_out.values

array(['Malaria Lyme_disease Yellow_Fever', 'Lyme_disease Zika Plague',
       'Malaria Dengue Plague', 'Lyme_disease Plague Tungiasis',
       'Dengue Chikungunya Rift_Valley_fever',
       'Dengue Chikungunya Rift_Valley_fever',
       'Lyme_disease Dengue Chikungunya', 'Lyme_disease Malaria Zika',
       'Lyme_disease Dengue Chikungunya',
       'Dengue Chikungunya Lyme_disease', 'Malaria Zika Lyme_disease',
       'Malaria Yellow_Fever Zika', 'Zika Malaria Plague',
       'Lyme_disease Plague Zika', 'Dengue Malaria Lyme_disease',
       'Malaria Lyme_disease Dengue', 'Malaria Lyme_disease Chikungunya',
       'Malaria Lyme_disease Plague', 'Lyme_disease Zika Malaria',
       'Malaria Dengue Chikungunya', 'Zika Tungiasis Rift_Valley_fever',
       'Lyme_disease Dengue Malaria', 'Dengue Zika Tungiasis',
       'Dengue Zika Chikungunya', 'Zika Plague Lyme_disease',
       'Lyme_disease Zika Malaria', 'Malaria Lyme_disease Zika',
       'Lyme_disease Chikungunya Malaria', 'Chikungunya 

In [79]:
from datetime import datetime

now = datetime.now().strftime("%Y%m%d_%H%M%S")

output = pd.read_csv('./data/input/sample_submission.csv', index_col=0)
output['prognosis'] = test_out.values

output.to_csv(f"./data/output/test_predicted_{now}.csv", header=True, index=True)