In [None]:
%matplotlib inline
import math
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyodbc
import seaborn as sns
import sqlalchemy
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

plt.rc("font", size=14)
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [None]:
# Open the SQL Server connection used to load the modelling dataset.
# The login is read from the environment so that no credentials are stored in
# the notebook; a missing variable raises KeyError naming the variable.
db_uid = os.environ['DATAMINING_DB_UID']
db_pwd = os.environ['DATAMINING_DB_PWD']
conn = pyodbc.connect(
    'DRIVER={SQL Server};SERVER=e1001spss1;DATABASE=datamining_temp;'
    'UID=' + db_uid + ';PWD=' + db_pwd
)

In [None]:
# SQL text that selects every column of pyt.BalancedDataset.
# NOTE(review): later cells slice columns by position, so the table's column
# order matters to them -- confirm it is stable.
query = """
    SELECT 
    *
    from
	  pyt.BalancedDataset """

In [None]:
# With `chunksize`, read_sql_query returns an iterator of DataFrames rather
# than a single DataFrame, so the chunks are concatenated into one frame here.
# ignore_index gives the combined frame a single continuous row index.
df = pd.concat(
    pd.read_sql_query(query, conn, chunksize=10000),
    ignore_index=True,
)

In [None]:
# Number of rows per distinct Target_Flag value.
df['Target_Flag'].value_counts()

In [None]:
# Pie chart of the Target_Flag value counts, each slice labelled with its percentage.
target_counts = df['Target_Flag'].value_counts()
target_counts.plot(
    kind='pie',
    y='Target_Flag',
    figsize=(5, 5),
    title='Target_Flag',
    autopct='%1.1f%%',
)

In [None]:
# Keep everything except the first column and the last three columns.
# NOTE(review): the selection is positional and rebinds `df`, so running this
# cell twice strips further columns -- re-run the load cell first.
kept_columns = df.columns[1:-3]
df = df[kept_columns]

In [None]:
# Summary statistics of the retained columns.
df.describe()

In [None]:
# Default-configured estimator.
# NOTE(review): `LR` is rebound with explicit settings in a later cell before
# anything is fitted, so this instance is never used.
LR = LogisticRegression()

In [None]:
# Split by position: every column but the last is a predictor, the last column
# is the target (kept as a one-column frame).
predictor_columns = df.columns[:-1]
target_columns = df.columns[-1:]
x = df[predictor_columns]
y = df[target_columns]

In [None]:
# Predictors removed by name before any encoding is applied.
excluded_features = [
    'General_Insurances',
    'New_CSI',
    'EBanking_Transaction_Segment',
    'Age_Band',
]
x = x.drop(columns=excluded_features)

In [None]:
# Columns whose name contains '_LastDate' are discarded; the remaining
# object-dtype columns are one-hot encoded with the prefix '<column>_'.
last_date_columns = [name for name in x.columns if '_LastDate' in name]
object_columns = [
    name
    for name in x.columns
    if '_LastDate' not in name and x[name].dtypes == 'object'
]

x = x.drop(columns=last_date_columns)
if object_columns:
    # One call encodes every object column; dummies are appended in column order.
    x = pd.get_dummies(
        x,
        prefix=[name + '_' for name in object_columns],
        columns=object_columns,
    )
    
    

In [None]:
def recode(y):
    """Return 1 when ``y`` equals the label 'T', and 0 for any other value."""
    return 1 if y == 'T' else 0
    

In [None]:
# Convert the target labels to 0/1 with `recode`.
# `assign` builds a new frame instead of writing into `y`, which is a slice of
# `df`; in-place column assignment there triggers SettingWithCopyWarning.
# NOTE(review): re-running this cell on already recoded values maps everything
# to 0, because only the label 'T' is recognised as positive.
y = y.assign(Target_Flag=y['Target_Flag'].apply(recode))


In [None]:
# Flatten the target into a 1-D NumPy array.
y = np.asarray(y).ravel()

In [None]:
# Unpenalised, class-balanced logistic regression used as the base estimator
# for recursive feature elimination.
# NOTE(review): penalty='none' is the spelling accepted by older scikit-learn
# releases; newer releases expect penalty=None -- confirm against the installed
# version.
LR = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    penalty='none',
    class_weight='balanced',
)

# n_features_to_select is passed by keyword: the positional form is rejected
# by current scikit-learn releases. RFE keeps 20 features.
rfe = RFE(LR, n_features_to_select=20)
rfe = rfe.fit(x, y)


In [None]:
# Restrict the predictors to the columns flagged in the RFE support mask.
# NOTE(review): this rebinds `x`, so the cell cannot be re-run without
# rebuilding `x` first (the mask length would no longer match).
selected_features = x.columns[rfe.support_]
x = x[selected_features]

In [None]:
# Fit the model on the reduced predictor set; this fitted `LR` is what the
# following cells evaluate and pickle.
LR.fit(x,y)

In [None]:
# Precision/recall/F1 report on the same data the model was fitted on.
train_predictions = LR.predict(x)
print(classification_report(y, train_predictions))

In [None]:
# ROC analysis on the training data.
# The positive-class probabilities are computed once and reused. AUC must be
# computed from these continuous scores: hard 0/1 predictions collapse the
# curve to a single operating point and misstate the area.
train_scores = LR.predict_proba(x)[:, 1]
logit_roc_auc = roc_auc_score(y, train_scores)
fpr, tpr, thresholds = roc_curve(y, train_scores)
auc_train = roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Logistic Regression - train (area = %0.2f)' % auc_train)
ax.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right");

In [None]:
# Two-column table pairing each selected feature name ('Coefficients') with
# its fitted weight ('Value').
coefficients = pd.DataFrame({
    'Coefficients': x.columns,
    'Value': LR.coef_.ravel(),
})

In [None]:
# Show the table ordered by 'Value', largest first.
coefficients.sort_values(by = 'Value', ascending = False)

In [None]:
# Destination of the pickled model (UNC path, written as a raw string so the
# backslashes are kept literally).
# NOTE(review): hard-coded absolute path -- consider moving it to a config value.
filename = r'\\e1001spss1\working area\Kontogeorgakos\model_exp.sav'

In [None]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(LR, model_file)