# Importing necessary libraries

In [1]:
#importing all the necessary libraries
import eda as eda
import dataframe as df
import visualize as v
import train as t
import pandas as pd

# Regression technique

## Fetching Boston dataframe

In [None]:
boston_df = df.get_boston()

## EDA

In [None]:
boston_df = eda.del_cols(boston_df,['Unnamed: 0'])
feature_boston_df= df.get_feature_df(boston_df,['Price'])
target_boston_df = df.get_target_df(boston_df,['Price'])


In [None]:
#checking null values and information about columns
eda.get_missing_values(boston_df)

In [None]:
eda.get_categorical_cols(feature_boston_df)

In [None]:
eda.get_numerical_cols(feature_boston_df)

In [None]:
#get_lmplot(boston_df,boston_df_features,'Price') - #giving correct result 
#eda.get_heatmap(boston_df) # giving correct result for continuous target variable
eda.get_pairgrid(boston_df,feature_boston_df,target_boston_df)

## Creating Regression models

In [None]:
num_cols, cat_cols = t.get_cols(feature_boston_df)
model,y_test,y_pred = t.model_pipeline(feature_boston_df,target_boston_df,num_cols=num_cols,cat_cols=cat_cols,task='linear_regression')

## Evaluating regression models

In [None]:
t.evaluate_model(y_test,y_pred,'regression')

In [None]:
v.plot_actual_vs_predicted(y_test=y_test,y_pred=y_pred)

In [None]:
v.plot_residuals(y_test,y_pred)

In [None]:
t.get_MAPE(y_test,y_pred)*100#finding out MAPE

In [None]:
print(t.get_training_error(feature_boston_df,target_boston_df,num_cols, cat_cols))

In [None]:
print(t.get_lasso_mape(1,feature_boston_df,target_boston_df))
print(t.get_ridge_mape(0.1,feature_boston_df,target_boston_df))
print(t.get_elasticnet_mape(0.001,feature_boston_df,target_boston_df))

lasso, ridge and elasticnet is worse than linear regression, hence we will continue to use linear regression

# Classification Techniques


## Fetching Churn dataset

In [2]:
churn_df = df.get_churn_df()

## EDA

In [3]:
#some EDA, removing irrelevant columns and changing the datatype of few columns
churn_df = eda.del_cols(churn_df,'customerID')
churn_df['TotalCharges'] = pd.to_numeric(churn_df['TotalCharges'].values,errors='coerce')

In [4]:
#dropping null values and creating a copy of a dataframe
churn_df_new = eda.drop_null_values(churn_df)

In [5]:
#Creating target dataframe from main dataframe
target_churn_df = df.get_target_df(churn_df_new,['Churn'])

In [6]:
target_1 = target_churn_df
target_1.value_counts()

Churn
No       5163
Yes      1869
Name: count, dtype: int64

In [7]:
#creating target as 0 and 1 instead of 'Yes' and 'No' so that it will work with XGBoost 
target_1.loc[target_1['Churn']=='No']=0
target_1.loc[target_1['Churn']=='Yes']=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_1.loc[target_1['Churn']=='No']=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_1.loc[target_1['Churn']=='Yes']=1


In [8]:
# Creating Features from main dataframe
feature_churn_df = df.get_feature_df(churn_df_new,['Churn'])

In [9]:
num_cols = eda.get_numerical_cols(feature_churn_df)
cat_cols = eda.get_categorical_cols(feature_churn_df)

In [10]:
target_2 = target_1.copy()
target_2=target_2.replace({'Yes':1,'No':0})
target_churn_df = target_2.copy()

  target_2=target_2.replace({'Yes':1,'No':0})


## KNN

In [13]:
model_knn,y_test_knn,y_pred_knn = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'knn')
report_knn=t.evaluate_model(y_test_knn,y_pred_knn,'knn')
print(report_knn)

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1033
           1       0.53      0.50      0.52       374

    accuracy                           0.75      1407
   macro avg       0.68      0.67      0.67      1407
weighted avg       0.75      0.75      0.75      1407

{'Accuracy Score': '74.9822316986496%', 'Classification report': None}


## Decision Tree

In [19]:
model_dt,y_test_dt,y_pred_dt = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'decision_tree')
report_dt = t.evaluate_model(y_test_dt,y_pred_dt,'decision_tree')
print(report_dt)

              precision    recall  f1-score   support

           0       0.81      0.80      0.80      1033
           1       0.46      0.48      0.47       374

    accuracy                           0.71      1407
   macro avg       0.64      0.64      0.64      1407
weighted avg       0.72      0.71      0.72      1407

{'Accuracy Score': '71.35749822316987%', 'Classification report': None}


## Random Forest

In [16]:
model_rf,y_test_rf,y_pred_rf = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'randon_forest')
report_rf = t.evaluate_model(y_test_rf,y_pred_rf,'random_forest')
print(report_rf)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1033
           1       0.61      0.45      0.52       374

    accuracy                           0.78      1407
   macro avg       0.72      0.67      0.69      1407
weighted avg       0.76      0.78      0.77      1407

{'Accuracy Score': '77.82515991471215%', 'Classification report': None}


## Naive Bayes

In [22]:
model_nb,y_test_nb,y_pred_nb = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'naive_bayes')
report_nb = t.evaluate_model(y_test_nb,y_pred_nb,'naive bayes')
print(report_nb)

              precision    recall  f1-score   support

           0       0.91      0.68      0.77      1033
           1       0.47      0.81      0.60       374

    accuracy                           0.71      1407
   macro avg       0.69      0.74      0.69      1407
weighted avg       0.79      0.71      0.73      1407

{'Accuracy Score': '71.14427860696517%', 'Classification report': None}


## SVM

In [25]:
model_sv,y_test_sv,y_pred_sv = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'svm')
report_sv = t.evaluate_model(y_test_sv,y_pred_sv,'svm')
print(report_sv)

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1407
   macro avg       0.74      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407

{'Accuracy Score': '79.1044776119403%', 'Classification report': None}


## Logistic Regression

In [28]:
model_lrc,y_test_lrc,y_pred_lrc = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'logistic_regression')
report_lrc = t.evaluate_model(y_test_lrc,y_pred_lrc,'logistic_regression')
print(report_lrc)

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.62      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407

{'Accuracy Score': '78.89125799573561%', 'Classification report': None}


## Adaboost

In [33]:
model_ada,y_test_ada,y_pred_ada = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'Ada Boost classifier',0)

In [34]:
t.evaluate_model(y_test_ada,y_pred_ada,'classification')

              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1033
           1       0.50      0.49      0.50       374

    accuracy                           0.73      1407
   macro avg       0.66      0.66      0.66      1407
weighted avg       0.73      0.73      0.73      1407



{'Accuracy Score': '73.41862117981522%', 'Classification report': None}

## Gradient Boosting

In [39]:
model_gb,y_test_gb,y_pred_gb = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'Gradient boost classifier')
t.evaluate_model(y_test_gb,y_pred_gb,'classification')

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.63      0.50      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



{'Accuracy Score': '78.82018479033405%', 'Classification report': None}

## XGBOOST

In [None]:
model_x, y_test_x,y_pred_x = t.model_pipeline(feature_churn_df,target_churn_df,num_cols,cat_cols,'xgboost')
t.evaluate_model(y_test_x,y_pred_x,'classification')