In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import seaborn as sns

In [None]:
# Load the raw dataset from a CSV file located in the working directory.
df = pd.read_csv("ml_project1_data.csv")

# 1.0 Análise Exploratória de Dados

In [None]:
# Bar chart with the number of missing values found in each column.
df.isna().sum().plot.bar()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Column groups reused by the plots and by the scaling step.
# Order matters: it is the left-to-right order used in the charts.
MNT_COLUMNS = [
    'MntFruits',
    'MntWines',
    'MntMeatProducts',
    'MntFishProducts',
    'MntGoldProds',
    'MntSweetProducts',
]
NUM_COLUMNS = [
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
]

### 1.1 - Detecção da presença de outliers

In [None]:
# Two stacked box plots: the Mnt* columns on top, the Num* columns below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(18, 8))
top_panel, bottom_panel = axes
ax1 = sns.boxplot(data=df.loc[:, MNT_COLUMNS], ax=top_panel, palette="Set1")
ax2 = sns.boxplot(data=df.loc[:, NUM_COLUMNS], ax=bottom_panel, palette="Set2")

In [None]:
# Summary statistics for the Income column.
df['Income'].describe()

### 1.2 - Matriz de correlação do dataset completo

In [None]:
# Correlation heatmap restricted to the numeric columns. The frame still holds
# text columns here (e.g. Education, Marital_Status), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so they are filtered out
# explicitly; select_dtypes works on every pandas version.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(14, 10))
sns.heatmap(numeric_df.corr(), cmap='YlGnBu', annot=True, fmt='.2f', vmin=-1);

In [None]:
# Grid of overview bar charts.
# Labels and bar heights for every panel are taken from the SAME Series:
# pairing `.unique()` (order of appearance) with `.value_counts()` (sorted by
# frequency) attaches the wrong label to each bar.
fig, axes = plt.subplots(3, 2, figsize=(22, 12))

mnt_totals = df[MNT_COLUMNS].sum()
num_totals = df[NUM_COLUMNS].sum()
education_pct = df['Education'].value_counts(normalize=True) * 100
marital_counts = df['Marital_Status'].value_counts()
top_birth_years = df['Year_Birth'].value_counts().sort_values(ascending=False).head(20)
kidhome_counts = df['Kidhome'].value_counts()

ax1 = axes[0, 0].bar(mnt_totals.index, mnt_totals)
ax2 = axes[0, 1].bar(num_totals.index, num_totals)
ax3 = axes[1, 0].bar(education_pct.index, education_pct)
ax4 = axes[1, 1].bar(marital_counts.index, marital_counts)
ax5 = axes[2, 0].barh(top_birth_years.index, top_birth_years)
# Previously this overwrote ax4; it now has its own name.
ax6 = axes[2, 1].bar(kidhome_counts.index, kidhome_counts)

axes[0, 0].set_title('Total amount per product category')
axes[0, 1].set_title('Total count per purchase/visit column')
axes[1, 0].set_title('Education (% of rows)')
axes[1, 1].set_title('Marital_Status (rows)')
axes[2, 0].set_title('20 most frequent Year_Birth values (rows)')
axes[2, 1].set_title('Kidhome (rows)');

In [None]:
# Frequency of each distinct Kidhome value.
df['Kidhome'].value_counts()

In [None]:
# Histograms of every remaining numeric column; ID, Z_CostContact and
# Z_Revenue are left out of the grid.
df.drop(columns=['ID', 'Z_CostContact', 'Z_Revenue']).hist(figsize=(22, 12))

In [None]:
# Pairwise scatter plots of the Mnt* columns, coloured by the Response target,
# with histograms on the diagonal.
sns.pairplot(
    data=df,
    vars=MNT_COLUMNS,
    hue='Response',
    diag_kind='hist',
)
plt.show()

In [None]:
# Pairwise scatter plots of the Num* columns, coloured by the Response target,
# with histograms on the diagonal.
sns.pairplot(
    data=df,
    vars=NUM_COLUMNS,
    hue='Response',
    diag_kind='hist',
)
plt.show()

In [None]:
# Horizontal bar chart of the target class counts, smallest class first.
df['Response'].value_counts().sort_values().plot.barh()

# 2.0 - Pré-processamento

In [None]:
# Keep only rows without any missing value. dropna() already returns a new
# frame, so no explicit copy is needed beforehand.
df = df.dropna(axis=0, how='any')

In [None]:
# Keep only rows whose Year_Birth is later than 1900.
born_after_1900 = df['Year_Birth'].gt(1900)
df = df.loc[born_after_1900]

In [None]:
# Keep only rows whose MntMeatProducts value is below 1000.
meat_below_cap = df['MntMeatProducts'].lt(1000)
df = df.loc[meat_below_cap]

In [None]:
# Keep only rows whose Income is below 150000.
income_below_cap = df['Income'].lt(150000)
df = df.loc[income_below_cap]

In [None]:
# Inspect the Marital_Status categories and how many rows each one has.
df['Marital_Status'].value_counts()

Divorced = Single

Widow = Single

Alone = Single

Absurd = Single

YOLO = Single

In [None]:
# Collapse the categories listed in the notes into 'Single'.
# 'Divorced' is included here: the notes map it to Single, but the previous
# list contained a redundant 'Single' in its place, so divorced rows were
# never remapped.
SINGLE_EQUIVALENTS = ['Divorced', 'Widow', 'Alone', 'Absurd', 'YOLO']
is_single_equivalent = df['Marital_Status'].isin(SINGLE_EQUIVALENTS)
df['Marital_Status'] = df['Marital_Status'].mask(is_single_equivalent, 'Single')

In [None]:
# Re-check the Marital_Status category counts after the remapping.
df['Marital_Status'].value_counts()

In [None]:
# Remove columns that are not used as model inputs.
# Note: rerunning this cell on the already-trimmed frame raises KeyError.
df = df.drop(columns=['ID', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue'])

In [None]:
# Box plot of Income after the filters above.
df['Income'].plot.box()

In [None]:
# Standardise each column group with its own freshly fitted StandardScaler.
# NOTE(review): the scalers are fitted on the full frame, before the
# train/test split performed later in the notebook, so test rows contribute
# to the scaling statistics. Consider fitting on the training split only.
for column_group in (NUM_COLUMNS, MNT_COLUMNS, ['Income', 'Recency']):
    df[column_group] = StandardScaler().fit_transform(df[column_group])

# 3.0 - Feature Engineering

In [None]:
# Correlation matrix of the numeric columns, used to rank features against
# the target. Education and Marital_Status are still text at this point and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so only the
# numeric columns are passed in. The stray plt.figure() call was removed:
# nothing was drawn on it, so it only produced an empty figure.
cor = df.select_dtypes(include='number').corr()

In [None]:
# Absolute correlation of every column with the target.
cor_target = cor["Response"].abs()
# Keep the columns whose absolute correlation exceeds 0.10
# (Response itself is retained, since it correlates perfectly with itself).
above_threshold = cor_target.gt(0.10)
relevant_features = cor_target.loc[above_threshold]

In [None]:
# Reduce the selection to a plain list of column names.
relevant_features = relevant_features.index.tolist()

In [None]:
# The text columns are not part of the correlation ranking, so they are added
# to the feature list by hand.
relevant_features.extend(['Education', 'Marital_Status'])

In [None]:
# Restrict the frame to the selected columns; the copy detaches it from the
# filtered frame it was sliced from.
df = df.loc[:, relevant_features].copy()


In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# One-hot encode the remaining text columns (Education and Marital_Status,
# appended to the feature list earlier); numeric columns pass through unchanged.
df = pd.get_dummies(df)

# 4.0 - Modelagem

In [None]:
# Separate the target column from the feature matrix.
y = df['Response']
X = df.drop(columns='Response')

In [None]:
# Class balance of the target before any resampling.
y.value_counts()

In [None]:
# Split FIRST, then oversample only the training portion.
# Running SMOTE before the split lets synthetic points interpolated from
# (future) test rows leak into the training set and fills the test set with
# synthetic rows, which inflates every reported metric.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y  # keep the class ratio in both splits
)

# Seeded so the resampled training set is identical on every run.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

## 4.1 - Random Forest

In [None]:
# Exhaustive 5-fold grid search over the random-forest hyper-parameters.
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated work) and it is rejected by scikit-learn >= 1.3.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}
rfc = RandomForestClassifier(random_state=42)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)


In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
CV_rfc.best_params_

In [None]:
# Build the final forest from the parameters the grid search actually
# selected, rather than values copied by hand from an earlier run, which
# silently go stale whenever the data or the grid changes.
clf = RandomForestClassifier(random_state=42, **CV_rfc.best_params_)
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split with the fitted random forest.
y_pred = clf.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the test split; print() is needed so
# the report's line breaks are rendered.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the random forest on the test split.
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; the Display
# class method is its replacement (available since 1.0).
metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test);

In [None]:
# ROC curve of the random forest on the test split.
# metrics.plot_roc_curve was removed in scikit-learn 1.2; the Display class
# method is its replacement (available since 1.0).
metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test);

## 4.2 - Support Vector Machine

In [None]:
# Exhaustive 5-fold grid search over the SVM regularisation strength,
# kernel coefficient and kernel type.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf', 'poly'],
)
grid_SVM = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid_SVM.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_SVM.best_params_

In [None]:
# Build the final SVM from the parameters the grid search actually selected,
# rather than values copied by hand from an earlier run.
svm = SVC(**grid_SVM.best_params_)

In [None]:
# Train the SVM on the training split and predict labels for the test split.
# Note: y_pred is reused here and overwrites the random-forest predictions.
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the SVM on the test split; print() is
# needed so the report's line breaks are rendered.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the SVM on the test split.
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; the Display
# class method is its replacement (available since 1.0).
metrics.ConfusionMatrixDisplay.from_estimator(svm, X_test, y_test);

In [None]:
# ROC curve of the SVM on the test split (scores come from the classifier's
# decision function). metrics.plot_roc_curve was removed in scikit-learn 1.2;
# the Display class method is its replacement (available since 1.0).
metrics.RocCurveDisplay.from_estimator(svm, X_test, y_test);