# Round2 

### Dealing with the data

## Get data

In [None]:
import pandas as pd
import numpy as np

# Show every column when a frame is rendered (no "..." truncation).
pd.set_option('display.max_columns', None)

# Load the raw marketing customer data.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this exact file exists.
dataset = pd.read_csv('/Users/szabonikolett/Desktop/Ironhack-Labs/lab-customer-analysis-round-2/files_for_lab/csv_files/marketing_customer_analysis.csv')

dataset.head()

## Show the dataframe shape

In [None]:
# (rows, columns) of the loaded frame.
dataset.shape

## Standardize header names

In [None]:
# Lower-case every header and swap spaces for underscores so columns are easy to reference.
cols = [header.lower().replace(' ', '_') for header in dataset.columns]
dataset.columns = cols

dataset.head()

## Numerical and categorical columns

In [None]:
# A cell only renders its last expression, so the dtype overview must be displayed explicitly.
display(dataset.dtypes)

# Names of the numeric columns.
dataset.select_dtypes(np.number).columns

In [None]:
# Names of the object-dtype (text / categorical) columns.
dataset.select_dtypes(object).columns

## Check and deal with NaN values

In [None]:
# Check the occurrence of NaN values per column.

# A cell only renders its last expression, so the raw counts are displayed explicitly.
display(dataset.isna().sum())  # number of NaN values per column

# Percentage of NaN values per column; rounding after scaling avoids float artifacts such as 5.6000000000000005.
(dataset.isna().mean() * 100).round(2)

In [None]:
# vehicle_type: per the NaN check, about half of its values are missing, so it is not worth keeping.
# unnamed:_0: identical to the row index, so it carries no information.
dataset = dataset.drop(columns=['vehicle_type', 'unnamed:_0'])
dataset

In [None]:
# Drop fully duplicated rows (the first occurrence is kept).
# Note: the index is not reset here, so it keeps gaps where duplicates were removed.

dataset = dataset.drop_duplicates()

## Datetime format 

In [None]:
# Examples of working with datetime format:

#file['date_time'] = pd.to_datetime(file['date_time'], errors='coerce')
#file.head()

In [None]:
# Derive a numeric month column from effective_to_date (1 = January, 2 = February, ...).
# A separate year column is not needed.

dataset['month'] = pd.DatetimeIndex(dataset['effective_to_date']).month

dataset

In [None]:
# Current (rows, columns) of the frame.
dataset.shape

In [None]:
# Fixing NaN values: rows with a missing state are labelled 'other'.
dataset['state'] = dataset['state'].fillna('other')

In [None]:
# Frequency of each response value; dropna=False also counts the NaNs.
dataset['response'].value_counts(dropna=False)


In [None]:
# Fill missing responses with 'No': 'No' occurs almost five times as often as 'Yes',
# so a missing value is more likely to be 'No' as well.
dataset['response'] = dataset['response'].fillna('No')

In [None]:
# Remaining NaN count per column.
dataset.isna().sum()

In [None]:
# Replace NaNs with 0 in number_of_open_complaints and months_since_last_claim.
# Fill with the number 0.0 rather than the string '0.0', so the numeric columns
# are never pushed through object dtype just to be cast back to float.
dataset['number_of_open_complaints'] = dataset['number_of_open_complaints'].fillna(0.0).astype(float)
dataset['months_since_last_claim'] = dataset['months_since_last_claim'].fillna(0.0).astype(float)

# Remaining object-dtype columns.
dataset.select_dtypes(object)

In [None]:
# Remaining NaN count per column.
dataset.isna().sum()

In [None]:
# Frequency of each vehicle class; dropna=False also counts the NaNs.
dataset['vehicle_class'].value_counts(dropna=False)

In [None]:
# Replace NaN values with 'Four-Door Car', the most common value in the column.
dataset['vehicle_class'] = dataset['vehicle_class'].fillna('Four-Door Car')

In [None]:
# Frequency of each vehicle size; dropna=False also counts the NaNs.
dataset['vehicle_size'].value_counts(dropna=False)

In [None]:
# Same approach as for vehicle_class: replace NaNs with the most common value.
dataset['vehicle_size'] = dataset['vehicle_size'].fillna('Medsize')

# Round3 
EDA (Exploratory Data Analysis)

## Show DataFrame info

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from collections import Counter

# Show all columns when previewing the frame.
pd.set_option('display.max_columns', None)

dataset.head()

In [None]:
# Current (rows, columns) of the frame.
dataset.shape

## Describe DataFrame

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns,
# which is what this "Describe DataFrame" section is meant to show.
dataset.describe()

### Show a plot of the total number of responses

In [None]:
import seaborn as sns

# Bar chart with the number of rows per response value.
sns.countplot(x = dataset['response'])
plt.ylabel('Total number of Response')

plt.show()

## Show a plot of the response rate by the sales channel 

In [None]:
plt.figure(figsize=(7,4))
# Response counts, split (coloured) by sales channel via hue.
# The column is passed as x= explicitly: current seaborn treats the first positional
# argument as `data`, so a bare 'response' string no longer selects the x variable.
sns.countplot(x='response', hue='sales_channel', data=dataset)
plt.ylabel('Response by Sales Channel')
plt.show()

In [None]:
# Distribution of total_claim_amount for each response value.
plt.figure(figsize=(8, 6))
sns.boxplot(data=dataset, x='response', y='total_claim_amount')
plt.ylabel('Response by Total Claim Amount')
plt.show()

In [None]:
# Distribution of income for each response value.
plt.figure(figsize=(5,3))
sns.boxplot(y='income', x='response', data=dataset)
plt.ylabel('Response by Income')  # label typo ('Inncome') fixed
plt.show()

# Round 4

## Continuous distributions, linear regression

### Check the data types of the columns. Get the numeric data into a dataframe called `numerical` and the categorical columns into a dataframe called `categorical`.

In [None]:
from sklearn.preprocessing import MinMaxScaler    
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [None]:
# Keep only the numeric columns; show all of them when rendering.
pd.set_option('display.max_columns', None)
numerical = dataset.select_dtypes(include=[np.number])

numerical

In [None]:
# Keep only the object-dtype (categorical) columns; show all of them when rendering.
pd.set_option('display.max_columns', None)
categorical = dataset.select_dtypes(include=[object])

categorical

In [None]:
# Drop the customer and effective_to_date columns from the working frame.
# Note: this only changes `dataset`; the `numerical` and `categorical` frames selected earlier are not updated.
dataset = dataset.drop(['customer', 'effective_to_date'], axis=1) 

In [None]:
# When I drop a column from my numerical dataframe, I also need to drop it from my original data.

## Check the normality of the numerical variables visually

### Use seaborn library to construct distribution plots for the numerical variables

In [None]:
# One distribution plot per numeric column; kde=True overlays the density curve.
for name in numerical:
    sns.displot(numerical[name], kde=True)

plt.show()

### Use Matplotlib to construct histograms

In [None]:
# One histogram per numeric column, each rendered as its own figure.
for name in numerical:
    numerical[name].hist()
    plt.show()

### Check the multicollinearity between the features. Please note that we will use the column total_claim_amount later as the target variable


In [None]:
# Pairwise correlation matrix of the numeric columns, rounded to two decimals.
correlations = numerical.corr().round(2)
correlations

In [None]:
sns.heatmap(correlations, annot=True)  # annot=True writes each coefficient into its cell
plt.show()  # must be called; a bare `plt.show` only references the function

We have a high negative correlation between income and total_claim_amount, meaning that when one of them increases, the other tends to decrease. 
The correlation between monthly_premium_auto and total_claim_amount is strong and positive: when one increases, the other tends to increase as well. 

# Round 5

## X-y split.
## Normalizing (numerical).

In [None]:
# Features: every column except the target. Target: total_claim_amount.
X = dataset.drop(columns=['total_claim_amount'])
y = dataset['total_claim_amount']

In [None]:
# Separate categorical (object) from numeric features.
X_cat = X.select_dtypes(include=object)
X_num = X.select_dtypes(include=np.number)

print(X_num.shape, X_cat.shape)

In [None]:
# Preview of the categorical features.
X_cat

In [None]:
X_num.describe().T # summary statistics, one row per numeric feature, to check the range of the values

In [None]:
# Min-max scaling: map every numeric feature into the 0-1 range.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
transformer = MinMaxScaler()
X_normalized = transformer.fit_transform(X_num)  # result is a NumPy array, not a DataFrame
print(X_normalized.shape)

# Back to a DataFrame (same column names) so it can be concatenated with the one-hot encoded features.
X_num_scale = pd.DataFrame(X_normalized, columns=X_num.columns)
X_num_scale.head()

In [None]:
# Standardisation: rescale every numeric feature to mean 0 and standard deviation 1.
# Note: this rebinds `transformer`, which previously held the MinMaxScaler.
transformer = StandardScaler()
X_standardized = transformer.fit_transform(X_num)
print(X_standardized.shape)

X_num_standard = pd.DataFrame(X_standardized, columns=X_num.columns)
X_num_standard

# Round 6

One Hot/Label Encoding (categorical).
Concat DataFrames
Linear Regression

Train-test split.
Apply linear regression.
Model Validation


In [None]:
#encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features; drop='first' removes one level per feature
# so the dummy columns of a feature are not perfectly collinear.
encoder = OneHotEncoder(drop='first').fit(X_cat)

# get_feature_names() has been removed from scikit-learn; get_feature_names_out()
# is the current API and is what the Round 7 pipeline in this notebook already uses.
cols = encoder.get_feature_names_out(input_features=X_cat.columns)

# transform() returns a sparse matrix, hence .toarray() before building the DataFrame.
X_cat_encode = pd.DataFrame(encoder.transform(X_cat).toarray(), columns=cols)

X_cat_encode

### Concat DataFrames, Linear Regression

In [None]:
# Put scaled numeric and one-hot encoded features side by side.
# Both frames were built from arrays, so they share a fresh 0..n-1 index and line up row by row.
X = pd.concat([X_num_scale, X_cat_encode], axis='columns')
X.describe()

## Train-test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Sanity check: shapes of the four pieces of the split.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn import linear_model

In [None]:
# Fit a linear regression on the training split, then report its R2 and MSE on that same split.
lm = linear_model.LinearRegression().fit(X_train, y_train)
print(lm.score(X_train, y_train))

y_pred = lm.predict(X_train)
print(mean_squared_error(y_train, y_pred))

In [None]:
# Model predictions for the training rows (y_pred was computed from X_train).
print(y_pred)

In [None]:
print(y_train)  # actual target values of the training split (the 80% not held out by test_size=0.2)

### Description:
R2.
MSE.
RMSE.
MAE.

In [None]:
from sklearn.metrics import mean_squared_error 
# predictions_test was used without ever being defined, so the metric cells failed
# with a NameError on a fresh kernel. Predict on the held-out rows first.
predictions_test = lm.predict(X_test)

# Mean squared error on the test split.
mse = mean_squared_error(y_test, predictions_test)
mse

In [None]:
# Root mean squared error on the test split (same units as the target).
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
rmse

In [None]:
# Mean of the test targets; useful as a scale reference when reading the error metrics.
y_test.mean()

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Mean absolute error on the test split.
mae = mean_absolute_error(y_test, predictions_test)
print(mae)

In [None]:
# R2 (coefficient of determination) on the test split.
# Author's note: the result is within the range where the prediction is considered fine.
r2 = r2_score(y_test, predictions_test) 
r2

## Round 7

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from collections import Counter

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OneHotEncoder

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the raw CSV again as the input for the function-based pipeline below.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this exact file exists.
dataset2 = pd.read_csv('/Users/szabonikolett/Desktop/Ironhack-Labs/lab-customer-analysis-round-2/files_for_lab/csv_files/marketing_customer_analysis.csv') 

In [3]:
def clean_data(df):
    """Return a cleaned copy of the raw marketing customer DataFrame.

    Steps, in order:
      1. Standardise the headers (lower case, spaces replaced by underscores).
      2. Drop ``vehicle_type`` and ``unnamed:_0``.
      3. Add a numeric ``month`` column derived from ``effective_to_date``.
      4. Fill missing values: ``state`` -> 'other', ``response`` -> 'No',
         ``number_of_open_complaints`` / ``months_since_last_claim`` -> 0.0,
         ``vehicle_class`` -> 'Four-Door Car', ``vehicle_size`` -> 'Medsize'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with the original (non-standardised) headers. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    # rename() returns a new frame; assigning to df.columns would rename the
    # caller's DataFrame in place as a hidden side effect.
    df = df.rename(columns=lambda name: name.lower().replace(' ', '_'))

    df = df.drop(['vehicle_type', 'unnamed:_0'], axis=1)

    df['month'] = pd.DatetimeIndex(df['effective_to_date']).month

    df['state'] = df['state'].fillna('other')

    df['response'] = df['response'].fillna('No')

    # Fill with the number 0.0 (not the string '0.0') so the numeric columns are
    # never pushed through object dtype before the float cast.
    for numeric_col in ('number_of_open_complaints', 'months_since_last_claim'):
        df[numeric_col] = df[numeric_col].fillna(0.0).astype(float)

    df['vehicle_class'] = df['vehicle_class'].fillna('Four-Door Car')

    df['vehicle_size'] = df['vehicle_size'].fillna('Medsize')

    return df
  

In [4]:
dataset10 = clean_data(dataset2) # store the cleaned frame returned by clean_data in a new variable


In [5]:
def scalingsteps(A):
    """Split ``A`` into model features and target, scaling and encoding the features.

    The target is ``total_claim_amount``. ``effective_to_date``, ``month`` and
    ``customer`` are removed from the features. Numeric features are min-max
    scaled, object-dtype features are one-hot encoded, and both parts are
    concatenated column-wise.

    Side effects: prints the shape of the scaled numeric array and displays the
    correlation matrix of the scaled numeric features.

    Parameters
    ----------
    A : pandas.DataFrame
        Cleaned data containing the columns named above.

    Returns
    -------
    tuple
        ``(X, y)``: the feature DataFrame (fresh 0..n-1 index) and the target
        Series (which keeps the index of ``A``).
    """
    # Every feature minus the target; the target becomes the label.
    X = A.drop(['total_claim_amount'], axis=1)
    y = A['total_claim_amount']

    # Columns that are not used as features.
    X = X.drop(columns=['effective_to_date', 'month', 'customer'])

    X_num = X.select_dtypes(np.number)
    X_cat = X.select_dtypes(object)

    # Min-max scaling maps each numeric feature to the 0-1 range. The scaler
    # returns a NumPy array, so it is wrapped back into a DataFrame.
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(X_num)
    print(scaled_values.shape)
    x_normalized = pd.DataFrame(scaled_values, columns=X_num.columns)

    # One-hot encoding yields a sparse matrix; densify it before building the frame.
    encoder = OneHotEncoder()
    dummy_matrix = encoder.fit_transform(X_cat)
    encoded = pd.DataFrame(
        dummy_matrix.toarray(),
        columns=encoder.get_feature_names_out(input_features=X_cat.columns),
    )

    X = pd.concat([x_normalized, encoded], axis=1).reset_index(drop=True)
    display(x_normalized.corr())

    return X, y


In [6]:
# Build the model inputs: X = scaled numeric + one-hot encoded features, y = total_claim_amount.
X, y = scalingsteps(dataset10)

(10910, 7)


Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
customer_lifetime_value,1.0,0.028306,0.403069,0.017189,0.01696,-0.030616,0.016024
income,0.028306,1.0,-0.009788,-0.016671,-0.005903,0.008425,-0.00996
monthly_premium_auto,0.403069,-0.009788,1.0,0.007845,0.018953,-0.010059,-0.020505
months_since_last_claim,0.017189,-0.016671,0.007845,1.0,-0.036081,0.036455,-0.000649
months_since_policy_inception,0.01696,-0.005903,0.018953,-0.036081,1.0,0.004165,-0.009123
number_of_open_complaints,-0.030616,0.008425,-0.010059,0.036455,0.004165,1.0,0.001681
number_of_policies,0.016024,-0.00996,-0.020505,-0.000649,-0.009123,0.001681,1.0
