In [228]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix

In [229]:
os.chdir("C:/Users/admin/Desktop/Spring GRA/newdata")

In [230]:
df=pd.read_excel("Call Center Data Framework GSU Clean.xlsx",sheet_name='Master')

In [231]:
df.shape

(76, 68)

In [232]:
df.isnull().sum()

Forecast Month                                                                  0
Model Run Month                                                                 0
Contract Expiration Month 1                                                     0
Contract Expiration Month 2                                                     0
Enrollments (Call Volume)                                                       3
                                                                               ..
Expected Enrollment Rate (EER) - Fixed Roll-Off (1 Month Carryover)             0
Expected Enrollment Rate (EER) - Existing Customer Email (1 Month New)          0
Expected Enrollment Rate (EER) - Existing Customer Email (1 Month Carryover)    0
Expected Enrollment Rate (EER) - Blend & Extend (1 Month New)                   0
Expected Enrollment Rate (EER) - Blend & Extend (1 Month Carryover)             0
Length: 68, dtype: int64

In [233]:
Threshold = np.mean(df['Escalations (Call Volume)'])
Threshold

149.67123287671234

In [234]:
df['Target'] = np.where(df['Escalations (Call Volume)']>Threshold,1,0)
df.head()

Unnamed: 0,Forecast Month,Model Run Month,Contract Expiration Month 1,Contract Expiration Month 2,Enrollments (Call Volume),Enrollments (Average Handling Time),Escalations (Call Volume),Escalations (Average Handling Time),Other (Call Volume),Other (Average Handling Time),...,Expected Enrollment Rate (EER) - Saveback (1 Month Carryover),Expected Enrollment Rate (EER) - Convert & Extend (1 Month New),Expected Enrollment Rate (EER) - Convert & Extend (1 Month Carryover),Expected Enrollment Rate (EER) - Fixed Roll-Off (1 Month New),Expected Enrollment Rate (EER) - Fixed Roll-Off (1 Month Carryover),Expected Enrollment Rate (EER) - Existing Customer Email (1 Month New),Expected Enrollment Rate (EER) - Existing Customer Email (1 Month Carryover),Expected Enrollment Rate (EER) - Blend & Extend (1 Month New),Expected Enrollment Rate (EER) - Blend & Extend (1 Month Carryover),Target
0,2017-01-01,2016-11-01,2017-02-01,2017-03-01,9131.0,599.0,456.0,622.0,84566.0,435.0,...,0.0,0.01,0.0,0.1,0.0,0.03,0.0,0.0,0.0,1
1,2017-02-01,2016-12-01,2017-03-01,2017-04-01,8639.0,615.0,528.0,589.0,78274.0,422.0,...,0.0,0.01,0.01,0.1,0.1,0.03,0.03,0.0,0.0,1
2,2017-03-01,2017-01-01,2017-04-01,2017-05-01,9374.0,615.0,612.0,577.0,84834.0,416.0,...,0.0,0.01,0.01,0.1,0.1,0.03,0.03,0.0,0.0,1
3,2017-04-01,2017-02-01,2017-05-01,2017-06-01,6005.0,634.0,427.0,551.0,68185.0,423.0,...,0.0,0.01,0.01,0.1,0.1,0.03,0.03,0.0,0.0,1
4,2017-05-01,2017-03-01,2017-06-01,2017-07-01,7245.0,626.0,439.0,599.0,72970.0,415.0,...,0.0,0.01,0.01,0.1,0.1,0.03,0.03,0.0,0.0,1


In [235]:
df.drop(columns = ['Enrollments (Call Volume)','Enrollments (Average Handling Time)','Escalations (Call Volume)','Escalations (Average Handling Time)',
                  'Other (Call Volume)','Other (Average Handling Time)','Spanish (Call Volume)','Spanish (Average Handling Time)','Expected Enrollment Rate (EER) - Blend & Extend (1 Month New)','Expected Enrollment Rate (EER) - Blend & Extend (1 Month Carryover)'],inplace = True)
df.dropna(inplace = True)
df.shape

(73, 59)

In [236]:
df['Forecast_year'] = pd.to_datetime(df['Forecast Month']).dt.year
df['Forecast_month'] = pd.to_datetime(df['Forecast Month']).dt.month
df['Contract Expiration Month 1_year'] = pd.to_datetime(df['Contract Expiration Month 1']).dt.year
df['Contract Expiration Month 1_month'] = pd.to_datetime(df['Contract Expiration Month 1']).dt.month
df['Contract Expiration Month 2_year'] = pd.to_datetime(df['Contract Expiration Month 2']).dt.year
df['Contract Expiration Month 2_month'] = pd.to_datetime(df['Contract Expiration Month 2']).dt.month

In [237]:
cyclic_features = ['Forecast_year','Forecast_month','Contract Expiration Month 1_year','Contract Expiration Month 1_month',
                   'Contract Expiration Month 2_year','Contract Expiration Month 2_month']
for col in cyclic_features:
    max_value = df[col].max()
    df[col+'_sin'] = np.sin(df[col] * (2 * np.pi / max_value))
    df[col+'_cos'] = np.cos(df[col] * (2 * np.pi / max_value))

In [238]:
df.drop(columns = ['Contract Expiration Month 1','Contract Expiration Month 2','NYMEX Settlement Date','Forecast Month','Model Run Month'],inplace = True)
df.shape

(73, 72)

In [239]:
# Split the data into features and target
X = df.copy()
X.drop(['Target'],axis = 1,inplace = True)
y = df['Target']

In [240]:
X.shape

(73, 71)

In [241]:
tss = TimeSeriesSplit(n_splits = 4,test_size=15)
for train_index, test_index in tss.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(58, 71) (15, 71) (58,) (15,)


In [242]:
X_train.head()

Unnamed: 0,Current Active Customer Count,Forecasted Customer Count,Expiring Fixed Price Customers (1 Month),Expiring Guaranteed Bill Customers (1 Month),Expiring Fixed Price Customers (2 Months),Expiring Guaranteed Bill Customers (2 Months),Call Center Closures,NYMEX Price (Forecast),NYMEX Price (Actual),Firm Model Residential Fixed Price,...,Forecast_month_sin,Forecast_month_cos,Contract Expiration Month 1_year_sin,Contract Expiration Month 1_year_cos,Contract Expiration Month 1_month_sin,Contract Expiration Month 1_month_cos,Contract Expiration Month 2_year_sin,Contract Expiration Month 2_year_cos,Contract Expiration Month 2_month_sin,Contract Expiration Month 2_month_cos
0,458397,461144.356067,11869,3546,13173,4426,6,3.237,2.764,0.531681,...,0.5,0.8660254,-0.018634,0.999826,0.8660254,0.5,-0.018634,0.999826,1.0,6.123234000000001e-17
1,465489,462642.356067,13173,4426,11491,4866,4,3.346,3.232,0.541255,...,0.866025,0.5,-0.018634,0.999826,1.0,6.123234000000001e-17,-0.018634,0.999826,0.8660254,-0.5
2,468321,470958.0,11491,4866,12519,4799,4,3.825,3.93,0.550519,...,1.0,6.123234000000001e-17,-0.018634,0.999826,0.8660254,-0.5,-0.018634,0.999826,0.5,-0.8660254
3,468742,469163.0,12519,4799,13481,3736,5,3.38,3.391,0.558495,...,0.866025,-0.5,-0.018634,0.999826,0.5,-0.8660254,-0.018634,0.999826,1.224647e-16,-1.0
4,467616,464658.0,13481,3736,12048,3259,5,2.868,2.627,0.564534,...,0.5,-0.8660254,-0.018634,0.999826,1.224647e-16,-1.0,-0.018634,0.999826,-0.5,-0.8660254


In [243]:
lr = LogisticRegression(solver='liblinear',penalty='l2',random_state=0)
lr.fit(X_train, y_train)
# Predict the test data
y_pred = lr.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9333333333333333


In [244]:
## Standardization
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X)
scaled_data=scaler.transform(X)

In [245]:
##  Applying PCA Algorithms
from sklearn.decomposition import PCA

In [246]:
pca=PCA(0.95)

In [247]:
data_pca=pca.fit_transform(scaled_data)

In [248]:
pca.explained_variance_ratio_
pca.n_components_

19

In [249]:
data_pca=pd.DataFrame(data_pca)
data_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-6.550017,3.594754,-0.552372,-3.677215,-0.20755,2.865464,6.933186,3.562052,-0.203644,1.416909,0.029064,0.023473,1.171257,1.901073,-1.294911,-1.421406,2.404549,-0.732899,0.010212
1,-6.298609,3.197927,-1.286735,-2.067683,-1.630875,-1.209238,1.396535,-0.6633,0.50208,-0.186098,-1.5016,-0.487686,-0.675057,-0.291229,-0.705473,-0.508921,-0.81452,-0.805889,0.924186
2,-6.141108,2.115446,-2.082964,-2.887818,-1.965207,-0.876642,0.490588,-1.299068,-0.383924,-0.656187,-1.412817,0.06025,-0.237671,-0.111754,0.213175,-0.33697,-0.977918,-0.990936,0.612422
3,-6.55308,-0.11908,-2.285568,-3.116231,-1.580011,-0.495003,0.339689,-1.116772,-0.728035,-1.634044,-0.247652,0.531329,0.626253,-0.365855,0.281608,-0.001811,-0.092249,-0.137583,0.499616
4,-6.987731,-1.395707,-1.691994,-2.759889,-1.609407,-0.457991,0.476115,-1.042347,-0.461767,-1.878109,0.188883,0.899418,0.11842,-0.024766,0.210405,0.355135,0.347846,-0.353897,0.120726


In [250]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(data_pca,y,train_size=0.8,test_size=0.2,random_state=0)
print(X_train_pca.shape, X_test_pca.shape, y_train.shape, y_test.shape)

(58, 19) (15, 19) (58,) (15,)


In [251]:
model = LogisticRegression(solver='newton-cg',penalty='l2',random_state=0)
model.fit(X_train_pca, y_train)
model.score(X_test_pca, y_test)

0.8666666666666667

In [252]:
from sklearn.svm import SVC
# Create an SVM object
svm = SVC(kernel='rbf', C=0.001,gamma=0.001)

# Train the SVM model
svm.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = svm.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score
print('Accuracy:', accuracy)

Accuracy: 0.7333333333333333
