In [1]:
import warnings

from sklearn.exceptions import ConvergenceWarning

# Show every distinct warning only the first time it is raised.
warnings.filterwarnings(action='once')

# The previous `ConvergenceWarning('ignore')` only instantiated the warning
# class and discarded it, so nothing was filtered. Register a real filter;
# it is added after the 'once' rule and therefore takes precedence for
# ConvergenceWarning.
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # used for plot interactive graph. 
from sklearn.model_selection import train_test_split # to split the data into two parts
from collections import Counter
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression # to apply the Logistic regression
from sklearn.model_selection import RandomizedSearchCV  # Randomized search on hyper parameters.

from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from scipy.stats import boxcox
from sklearn import metrics # for the check the error and accuracy of the model

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

In [None]:
# Read sheet index 1 of each of the two workbooks and stack the resulting
# customer frames into one (row labels from each workbook are kept as-is).
raw_1, raw_2 = (
    pd.read_excel(workbook_path, sheet_name=1)
    for workbook_path in (
        'D:/UK/assignment/DSA8023/WB1_Energia_Challenge_March_2023_Data.xlsx',
        'D:/UK/assignment/DSA8023/WB2_Energia_Challenge_March_2023_Data.xlsx',
    )
)
raw_d = pd.concat([raw_1, raw_2])

In [None]:
# Notebook-wide constants

# When False, rows that still contain a zero bill are excluded from the
# cleaned data; when True they are kept and a zero counter column is added.
is_zero_bill_value_allowed = False

# Billing period columns in chronological order: six periods for each of
# 2021 and 2022, followed by the first period of 2023.
billing_months = [
    'bill_{}_{}'.format(period, year)
    for year in (2021, 2022)
    for period in range(1, 7)
] + ['bill_1_2023']

# Columns holding categorical values.
categorical_columns = [
    'title',
    'mosaicType',
    'agedBand',
    'saStatus',
    'signedUpGroup',
]

def average_missing_months(x):
    """Spread an accumulated bill over the preceding zero-valued period.

    Rationale (from the original author): if one reading is missed for a
    billing period, that consumption is accumulated into the next period's
    bill. The accumulated value is therefore split evenly between the missed
    period and the one that follows it.

    Only a single missed period is repaired at a time: as soon as a zero
    period is followed by another zero period, processing of the row stops
    and the remaining values are left untouched.

    Parameters
    ----------
    x : row-like object indexable by the names in ``billing_months``
        (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
        It is modified in place.

    Returns
    -------
    The same object ``x`` after the adjustments.
    """
    # Walk over (period, following period) pairs in chronological order.
    for current, following in zip(billing_months, billing_months[1:]):
        if x[current] != 0:
            continue
        if x[following] == 0:
            # Two consecutive zero periods: give up on this row.
            break
        half = x[following] / 2
        x[current] = half
        x[following] = half
    return x

def checkCateoricalInfo(data, cols):
    """Print, for each column in ``cols``, its name, the number of distinct
    values and the distinct values themselves, followed by a blank line.

    Parameters
    ----------
    data : frame-like object whose columns expose ``.unique()``.
    cols : iterable of column names to report on.
    """
    for name in cols:
        distinct = data[name].unique()
        summary = '{}({}) - {}'.format(name, len(distinct), distinct)
        # end='\n\n' emits the summary line plus one empty separator line.
        print(summary, end='\n\n')

def scale_data_standard_scaler(data, feature_cols):
    """Return a copy of ``data`` with ``feature_cols`` run through a
    ``StandardScaler`` fitted on those same columns.

    Parameters
    ----------
    data : DataFrame to scale; it is not modified.
    feature_cols : list of column names to scale.

    Returns
    -------
    A new DataFrame; columns outside ``feature_cols`` are left as copied.
    """
    result = data.copy()
    raw_values = result[feature_cols].values
    # Fit and apply on the same values, then write them back in place of
    # the originals.
    result[feature_cols] = StandardScaler().fit(raw_values).transform(raw_values)
    return result


In [None]:
df = raw_d.copy()

checkCateoricalInfo(df, categorical_columns)

le = LabelEncoder()

# Hand-picked numeric codes for the EV related columns. Both "switched" and
# "new" EV customers are mapped to 1.
rename_props = {
    'EV': {'N': 0, 'Y': 1},
    'EV_New_or_Old': {
        'Non-EV Customers': 0,
        'Switched to EV': 1,
        'New EV Customer': 1,
    },
}

# Label-encode each categorical column (the shared encoder is refitted per
# column, in the order listed), then apply the manual EV mappings.
# NaN values in title / agedBand are deliberately not removed beforehand:
# per the original author they have relatively little effect on the EV
# classification.
df = df.assign(**{
    column: le.fit_transform(df[column])
    for column in ('title', 'mosaicType', 'agedBand', 'signedUpGroup', 'saStatus')
}).replace(rename_props)


In [None]:
def visualizeFeatureCount(dataset, feature = 'EV'):
    """Draw a count plot of ``feature`` annotated with counts and percentages.

    The number of rows "with" the feature is taken as the sum of the column
    (so the column is expected to hold 0/1 values); the remainder of the rows
    count as "without".

    Parameters
    ----------
    dataset : DataFrame containing the ``feature`` column.
    feature : name of the column to summarise.
    """
    n_rows = len(dataset)
    n_with = dataset[feature].sum()
    n_without = n_rows - n_with

    pct_with = round(n_with / n_rows * 100, 2)
    pct_without = round(n_without / n_rows * 100, 2)

    plt.figure()
    sns.countplot(data=dataset, x=feature)

    # (text, xy, xytext) for each label; positions are fixed data coordinates.
    labels = (
        ('No {}: {}'.format(feature, n_without), (-0.3, 15000), (-0.3, 3000)),
        ('{}: {}'.format(feature, n_with), (0.7, 15000), (0.9, 3000)),
        (str(pct_without) + " %", (-0.3, 15000), (-0.1, 8000)),
        (str(pct_with) + " %", (0.7, 15000), (0.9, 8000)),
    )
    for text, anchor, text_position in labels:
        plt.annotate(text, xy=anchor, xytext=text_position, size=12)
    plt.show()

def plot_hist_plots(t_data, t_features, fig_size= 8):
    """Plot one histogram per feature on a three-column grid.

    Parameters
    ----------
    t_data : DataFrame holding the columns to plot.
    t_features : sized iterable of column names.
    fig_size : width and height of the (square) figure.
    """
    figure = plt.figure(figsize=(fig_size, fig_size))
    # Three plots per row; one extra row is always reserved.
    n_rows = round(len(t_features) / 3) + 1
    for position, name in enumerate(t_features, start=1):
        axes = figure.add_subplot(n_rows, 3, position)
        sns.histplot(t_data[name])
        axes.title.set_text(name)
    plt.show()

def plot_corr_plot(ds):
    """Draw a heatmap of the pairwise correlations of the numeric columns
    of ``ds``.

    Parameters
    ----------
    ds : DataFrame to correlate.
    """
    correlations = ds.corr(numeric_only=True)
    _, axis = plt.subplots(figsize=(8, 7))
    heatmap_options = dict(
        cbar=True,
        square=True,
        annot=False,
        fmt='.1f',
        xticklabels=True,
        yticklabels=True,
        cmap="coolwarm",
        linewidths=.5,
    )
    sns.heatmap(correlations, ax=axis, **heatmap_options)
    plt.title('CORRELATION MATRIX - HEATMAP', size=18)

In [None]:
# Repair single missed billing periods row by row.
df = df.apply(average_missing_months, axis='columns')

In [None]:

# initial data cleaning

# Columns removed before modelling: the date columns plus saStatus and
# accountID.
# NOTE(review): 'EV_New_or_Old' is kept and is 1 exactly for the EV customer
# categories, so it looks like it leaks the 'EV' target - confirm whether it
# should be dropped here as well.
cols_to_delete = ['StartDate','ContractStartDateEV','contractStartDate','contractEndDate','saStatus', 'accountID']

# The guard used to inspect `cleaned_data`, which does not exist yet on a
# fresh kernel (NameError). Check `df` itself, and drop only the columns that
# are still present so re-running this cell cannot raise a KeyError.
cols_present = [col for col in cols_to_delete if col in df.columns]
if cols_present:
    df = df.drop(columns=cols_present)

if is_zero_bill_value_allowed:
    # Keep every row and record how many billing periods are zero.
    df['zero_count'] = df[billing_months].isin([0]).sum(axis=1)
    cleaned_data = df.copy()
else:
    # Discard every row that still has a zero in any billing period.
    cleaned_data = df[~df[billing_months].isin([0]).any(axis=1)]



visualizeFeatureCount(cleaned_data)

In [None]:
# Correlation heatmap of the cleaned data
plot_corr_plot(ds=cleaned_data)

In [None]:
# Histograms of every column of the cleaned data, before any transformation
plot_hist_plots(t_data=cleaned_data, t_features=cleaned_data.columns, fig_size=12)


In [None]:
# Square-root transformation (currently disabled) to reduce right skewness in the billing period distributions
# sqrt_t_data = cleaned_data.copy()

# for month in billing_months:
#     sqrt_t_data[month] = np.sqrt(sqrt_t_data[month])

# plot_hist_plots(sqrt_t_data, billing_months)

# perform logistic regression

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning


def perform_sampling(x_train, y_train, y_test, sampling):
    """Resample the training split to rebalance its classes.

    Parameters
    ----------
    x_train, y_train : training features and labels to resample.
    y_test : test labels; only used for the printed class counts.
    sampling : ``'under'`` for NearMiss (version 1, 3 neighbours)
        under-sampling or ``'over'`` for SMOTE over-sampling.

    Returns
    -------
    (x_train, y_train) after resampling.

    Raises
    ------
    ValueError
        If ``sampling`` is neither ``'under'`` nor ``'over'``. Previously
        this case failed later with an UnboundLocalError on ``sampler``.
    """
    # Validate up front so an unsupported mode fails with a clear message.
    # NearMiss and SMOTE come from the notebook's top-level imports.
    if sampling == 'under':
        sampler = NearMiss(version=1, n_neighbors=3)
    elif sampling == 'over':
        sampler = SMOTE(random_state=42)
    else:
        raise ValueError(
            "Unsupported sampling mode {!r}; expected 'under' or 'over'".format(sampling)
        )

    # Class counts before resampling.
    print('Training set: {}'.format(Counter(y_train)))
    print('Testing set: {}'.format(Counter(y_test)))

    x_train, y_train = sampler.fit_resample(x_train, y_train)

    # Class counts after resampling (the test split is not resampled).
    print('Training set: {}'.format(Counter(y_train)))
    print('Testing set: {}'.format(Counter(y_test)))

    return x_train, y_train

@ignore_warnings(category=ConvergenceWarning)
def perform_logistic_reg(data, class_col, sampling=None, solver='lbfgs'):
    """Fit a logistic regression on ``data`` and report its metrics.

    The data is split 80/20 (stratified on the class column, fixed seed);
    if ``sampling`` is given, only the training part is resampled via
    ``perform_sampling``. Metrics are printed/plotted by ``get_metrics``.

    Parameters
    ----------
    data : DataFrame with the feature columns and the class column.
    class_col : name of the target column.
    sampling : optional resampling mode forwarded to ``perform_sampling``.
    solver : solver name passed to ``LogisticRegression``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    features = data.drop(class_col, axis=1)
    target = data[class_col]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.20, stratify=target, random_state=42
    )

    if sampling:
        X_train, y_train = perform_sampling(X_train, y_train, y_test, sampling)

    classifier = LogisticRegression(C=0.0005, random_state=0, solver=solver)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    get_metrics(predictions, X_test, y_test, classifier, features, target, sampling)
    return classifier

def get_metrics(y_pred, X_test, y_test, LR, x, y, sampling):
    """Print evaluation metrics and plot the confusion matrix of a classifier.

    Parameters
    ----------
    y_pred : predicted labels for the test split.
    X_test : test features. Kept for interface compatibility; the confusion
        matrix now uses ``y_pred`` instead of predicting again.
    y_test : true labels of the test split.
    LR : the estimator; passed to ``cross_val_score`` for 5-fold CV.
    x, y : full feature set and labels used for the cross-validation.
    sampling : label of the resampling mode, shown in the plot title.
    """
    # accuracy_score expects (y_true, y_pred); the arguments were swapped.
    print('Accuracy:', metrics.accuracy_score(y_test, y_pred))

    ## 5-fold cross-validation
    cv_scores = cross_val_score(LR, x, y, cv=5)

    # Print the classification report and the 5-fold cross-validation scores
    print()
    print(classification_report(y_test, y_pred))
    print()
    print("Average 5-Fold CV Score: {}".format(round(np.mean(cv_scores),4)),
          ", Standard deviation: {}".format(round(np.std(cv_scores),4)))

    plt.figure(figsize=(4,3))
    # Reuse the supplied predictions so the matrix matches the report above
    # and the model is not asked to predict a second time.
    ConfMatrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(ConfMatrix,annot=True, cmap="Blues", fmt="d",
                xticklabels = ['No EV', 'EV'],
                yticklabels = ['No EV', 'EV'])
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title("Confusion Matrix - Logistic Regression - {}".format(sampling));

In [None]:
# Logistic regression on the cleaned data with SMOTE over-sampling of the training split
LR2 = perform_logistic_reg(data=cleaned_data, class_col='EV', sampling='over')

In [None]:
# scaled_data = scale_data_standard_scaler(cleaned_data, billing_months)
# LR4 = perform_logistic_reg(scaled_data, class_col='EV', sampling='over' )

In [None]:
# Display the (rows, columns) dimensions of the working frame `df`
df.shape

In [None]:
x = df.copy()

# x = df[df.EV == 1]

# Keep the actual labels so they can be shown next to the predictions.
EV = df['EV']

print(x.shape)

# The model was fitted on every column except the target.
x = x.drop(columns=['EV'])

x['EV_predicted'] = LR2.predict(x)
# This assignment had no right-hand side (SyntaxError). Restore the actual
# labels; assigning positionally avoids any index alignment, since the
# concatenated workbooks keep their original (repeated) row labels.
x['EV'] = EV.to_numpy()
# visualizeFeatureCount(x, feature='EV_predicted')

x.head()