## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns',None)

import warnings
warnings.filterwarnings('ignore')

## ML Packages

In [None]:
import xgboost
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV,cross_val_score,GridSearchCV

## Importing Data and Labels

In [None]:
# Load the training features, the competition test features and the training labels.
# Forward slashes are used because the previous backslash form ('..\Data\...')
# relied on invalid escape sequences and only resolved on Windows.
df = pd.read_csv('../Data/dengue_features_train.csv')
test = pd.read_csv('../Data/dengue_features_test.csv')
labels = pd.read_csv('../Data/dengue_labels_train.csv')

# Attach the target to the training rows under the column name 'labels'.
labels_totalcases = labels['total_cases']
df['labels'] = labels_totalcases

# Stack train and test so cleaning/encoding is applied to both in one pass.
# Test rows end up with NaN in 'labels' (assuming the test CSV has no such column).
df = pd.concat([df,test],axis=0,sort=False)

## Data Housekeeping

In [None]:
# Count missing cells per column, then display the grand total for the frame.
missing_per_column = df.isna().sum()
missing_per_column.sum()

In [None]:
# Columns that get mean-imputed in the next cell, grouped by name prefix.
missing_columns_list = [
    # ndvi_* columns
    'ndvi_ne',
    'ndvi_nw',
    'ndvi_se',
    'ndvi_sw',
    # precipitation_* column
    'precipitation_amt_mm',
    # reanalysis_* columns
    'reanalysis_air_temp_k',
    'reanalysis_avg_temp_k',
    'reanalysis_dew_point_temp_k',
    'reanalysis_max_air_temp_k',
    'reanalysis_min_air_temp_k',
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'reanalysis_sat_precip_amt_mm',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_tdtr_k',
    # station_* columns
    'station_avg_temp_c',
    'station_diur_temp_rng_c',
    'station_max_temp_c',
    'station_min_temp_c',
    'station_precip_mm',
]

In [None]:
# Fill gaps in each listed column with that column's own mean.
# A single DataFrame.fillna call keyed by column name replaces the per-column
# `df[i].fillna(..., inplace=True)` loop: that chained in-place form operates on
# a temporary Series and does not update `df` under pandas copy-on-write.
# Columns absent from the Series of means (e.g. 'labels') are left untouched.
column_means = df[missing_columns_list].mean()
df = df.fillna(column_means)

## Feature-Encoding

In [None]:
# One-hot encode the city; drop_first=True keeps one indicator column fewer
# than there are cities.
df = pd.get_dummies(df, columns=['city'], drop_first=True)

# Split the week start date into calendar components.
week_start = pd.to_datetime(df['week_start_date'])
df['Year'] = week_start.dt.year
df['Month'] = week_start.dt.month
df['Day'] = week_start.dt.day

# Remove the raw date string together with the columns excluded from modelling.
columns_to_drop = [
    'week_start_date',
    'year',
    'weekofyear',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_sat_precip_amt_mm',
    'reanalysis_avg_temp_k',
]
df = df.drop(columns=columns_to_drop)

## Correlation

In [None]:
# Pairwise Pearson correlation between all predictors (the target is excluded).
features = df.drop(columns=['labels']).columns
corr_matrix = df[features].corr()  # computed once, reused for mask and plot

# Boolean mask that hides the upper triangle (diagonal included) so each pair
# is drawn only once. The builtin `bool` is used because the `np.bool` alias
# was removed in NumPy 1.24.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

f, ax = plt.subplots(figsize=(16, 12))
plt.title('Pearson Correlation Matrix',fontsize=25)

sns.heatmap(corr_matrix,linewidths=0.25,vmax=0.7,square=True,cmap="BuGn",
            linecolor='w',annot=True,annot_kws={"size":8},mask=mask,cbar_kws={"shrink": .9}); #"BuGn_r" to reverse


## Train | Test  Split

In [None]:
# Undo the earlier train/test concat: the first len(labels) rows are the
# labelled training rows, the remainder is the competition test set.
# Deriving the boundary from `labels` avoids a hard-coded row count.
n_train_rows = len(labels)
train = df.iloc[:n_train_rows]
test = df.iloc[n_train_rows:]

## X and Y

In [None]:
# Predictors: every column except the target.
X = train.drop(columns=['labels'])
# Target as a 1-D Series. Scikit-learn regressors expect y of shape
# (n_samples,); the earlier single-column DataFrame triggered a
# DataConversionWarning that the global warnings filter was hiding.
y = train['labels']

## Feature-Scaling

In [None]:
# Standardise every predictor; `scaler` is kept so it can be reused later.
# NOTE(review): the scaler is fit on all labelled rows before the hold-out
# split below, so hold-out rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

## Training and testing Data

In [None]:
# Hold out 20% of the labelled rows for evaluation; fixed seed for repeatability.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Applying Regression Modelling Techniques

## Linear Reg

In [None]:
# Baseline: ordinary least-squares regression, scored by MAE on the hold-out split.
lr = LinearRegression().fit(xtrain, ytrain)
pred_lr = lr.predict(xtest)
mean_absolute_error(ytest, pred_lr)

## SVM - Linear Kernel

In [None]:
# Support vector regression with a linear kernel, scored by hold-out MAE.
svr_linear = SVR(kernel='linear').fit(xtrain, ytrain)
pred_svr_linear = svr_linear.predict(xtest)
mean_absolute_error(ytest, pred_svr_linear)

## SVM - RBF

In [None]:
# Support vector regression with an RBF kernel, scored by hold-out MAE.
svr_rbf = SVR(kernel='rbf')
# Fit the RBF model itself; the earlier code called svr_linear.fit here, so the
# RBF estimator was never trained and the linear-kernel score was reported twice.
svr_rbf = svr_rbf.fit(xtrain,ytrain)
pred_svr_rbf = svr_rbf.predict(xtest)
mean_absolute_error(ytest,pred_svr_rbf)

## Decision Tree

In [None]:
# Single regression tree with a fixed seed, scored by hold-out MAE.
dt = DecisionTreeRegressor(random_state=0)
dt.fit(xtrain, ytrain)
pred_dt = dt.predict(xtest)
mean_absolute_error(ytest, pred_dt)

## RF Regression

In [None]:
# Random forest with default hyper-parameters and a fixed seed; hold-out MAE.
rf = RandomForestRegressor(random_state=0).fit(xtrain, ytrain)
pred_rf = rf.predict(xtest)
mean_absolute_error(ytest, pred_rf)

## AdaBoost

In [None]:
# AdaBoost regressor with default hyper-parameters and a fixed seed; hold-out MAE.
ada = AdaBoostRegressor(random_state=0)
ada.fit(xtrain, ytrain)
pred_ada = ada.predict(xtest)
mean_absolute_error(ytest, pred_ada)

## Gradient-Boosting

In [None]:
# Gradient-boosted trees (scikit-learn) with a fixed seed; hold-out MAE.
gb = GradientBoostingRegressor(random_state=0).fit(xtrain, ytrain)
pred_gb = gb.predict(xtest)
mean_absolute_error(ytest, pred_gb)

## XG-Boost

In [None]:
# XGBoost regressor with library defaults and a fixed seed; hold-out MAE.
xgb = XGBRegressor(random_state=0)
xgb.fit(xtrain, ytrain)
pred_xgb = xgb.predict(xtest)
mean_absolute_error(ytest, pred_xgb)

## Tuning XG-Boost Regressor

In [None]:
# Candidate hyper-parameter values shared by the random and grid searches below.
# NOTE(review): the learning_rate values are not in ascending order
# (0.5, 0.75, 0.1, ...) — confirm 0.5/0.75 are intended rather than 0.05/0.075.
parameters = dict(
    max_depth=[5, 6, 7, 8],
    learning_rate=[0.5, 0.75, 0.1, 0.2, 0.3],
    n_estimators=[300, 350, 360, 400, 500],
)

#### Random Search CV

In [None]:
# Randomised hyper-parameter search: 20 sampled combinations from `parameters`,
# each scored by 10-fold cross-validated (negative) MAE on the training split.
xgb = XGBRegressor(random_state=0)
# random_state pins which combinations are sampled, so reruns give the same result.
random_search = RandomizedSearchCV(estimator=xgb,param_distributions=parameters,
                                   scoring='neg_mean_absolute_error',
                                   cv=10,n_iter=20,random_state=0)
tuned_xgb = random_search.fit(xtrain,ytrain)
# With the default refit=True, best_estimator_ has already been retrained on the
# whole of xtrain/ytrain, so no further fit call is needed.
tuned_xgb_model = tuned_xgb.best_estimator_
pred_tuned_xgb_model = tuned_xgb_model.predict(xtest)
mean_absolute_error(ytest,pred_tuned_xgb_model)

#### Grid Search CV

In [None]:
# Exhaustive grid search over every combination in `parameters`, each scored by
# 10-fold cross-validated (negative) MAE on the training split.
# `xgb` is only the template: the search clones it, so `xgb` itself stays unfitted.
xgb = XGBRegressor(random_state=0)
# Named grid_search: this is a GridSearchCV, not a randomised search.
grid_search = GridSearchCV(estimator=xgb,param_grid=parameters,
                           scoring='neg_mean_absolute_error',
                           cv=10)
tuned_xgb = grid_search.fit(xtrain,ytrain)
# With the default refit=True, best_estimator_ has already been retrained on the
# whole of xtrain/ytrain, so no further fit call is needed.
tuned_xgb_model = tuned_xgb.best_estimator_
pred_tuned_xgb_model = tuned_xgb_model.predict(xtest)
mean_absolute_error(ytest,pred_tuned_xgb_model)

# Final Predictions on the Main Test Data

In [None]:
# Predictors of the competition test rows (their 'labels' column holds no targets).
test_X = test.drop(columns=['labels'])
# Reuse the scaling statistics learned on the training predictors. Calling
# fit_transform here would re-fit the scaler on the test rows and put them on a
# different scale from the one the model was trained on.
test_X = scaler.transform(test_X)
# Predict with the tuned, fitted model. `xgb` is only the unfitted template that
# was handed to the search, so it cannot be used for prediction.
final_predictions = tuned_xgb_model.predict(test_X)
# Round the predictions to whole numbers.
final_predictions = final_predictions.round().astype(int)

## Making Dataset Ready

In [None]:
# Rebuild the submission frame from the untouched test CSV so the identifying
# columns keep their original values, then attach the predictions.
# Forward slashes are used because the previous backslash form ('..\Data\...')
# relied on invalid escape sequences and only resolved on Windows.
df_test_final = pd.read_csv('../Data/dengue_features_test.csv')
df_test_final = df_test_final[['city','year','weekofyear']]
df_test_final['total_cases'] = final_predictions

# Final Submission

In [None]:
# Uncomment the next line to write the submission file to disk.
#df_test_final.to_csv('../Data/Submission.csv',index=False)