<a id='import'></a>

# <font color = '#cc9900'> 1. Import Data </font>

<a id='lib'></a>

### 1.1. Import the needed libraries

__`Step`__ Import the following libraries/functions: 
    - pandas as pd 
    - numpy as np
    - pyplot from matplotlib as plt 
    - seaborn as sns
    - MinMaxScaler from sklearn.preprocessing

In [None]:
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

<a id='integrate'></a>

### 1.2. Import and integrate data

__`Step`__ Import the excel file `demographic.xlsx` and store it in the object `demographic`

In [None]:
# Read the demographic workbook into a DataFrame and show it.
demographic = pd.read_excel(io='demographic.xlsx')
demographic

__`Step`__ Import the csv file `firmographic.csv` and store it in the object `firmographic`.

In [None]:
# Read the firmographic CSV into a DataFrame and show it.
firmographic = pd.read_csv(filepath_or_buffer='firmographic.csv')
firmographic

__`Step`__ Merge the data from the two previous files and store it in the object `df`

In [None]:
# Join the two sources on the shared 'Card_ID' key; only cards present in
# both tables are kept (inner join, which is the merge default).
df = pd.merge(left=demographic, right=firmographic, how='inner', on='Card_ID')
df

<a id='duplicates'></a>

### 1.3. Check for duplicates

__`Step`__ Drop any duplicate rows present in the dataframe with the method `drop_duplicates()`

In [None]:
# Keep the first occurrence of every fully identical row and discard the rest.
df = df.drop_duplicates()
df

<a id='index'></a>

### 1.4. Set Index 

__`Step`__ Define the variable "Card_ID" as the index of the dataframe using the method `set_index()`.

In [None]:
# Promote 'Card_ID' from a regular column to the row index.
df = df.set_index('Card_ID')
df

<a id='preprocess'></a>

# <font color = '#cc9900'> 2. Preprocess Data </font>

<a id='clean'></a>

## 2.1. Data Cleaning

__`Step`__ Create a new variable 'Age', as it is more insightful than 'Year_Birth', and replace 'Year_Birth' with 'Age'.

In [None]:
# Replace 'Year_Birth' with the more interpretable 'Age' (in years).
# NOTE(review): age is measured against the year in which the notebook is run,
# so the values drift over time; pin a reference year if results must be
# reproducible.
# The guard makes the cell safe to re-run: once 'Year_Birth' has been dropped,
# running the cell again is a no-op instead of a KeyError.
if 'Year_Birth' in df.columns:
    age = date.today().year - df['Year_Birth']
    # Drop the source column (and any stale 'Age' from a previous run).
    df = df.drop(columns=['Year_Birth', 'Age'], errors='ignore')
    # Place 'Age' as the second column of the frame.
    df.insert(1, 'Age', age)
elif 'Age' not in df.columns:
    # Neither the input nor the result exists: fail loudly rather than skip.
    raise KeyError("Neither 'Year_Birth' nor 'Age' is present in df")
df

__`Step`__ 

<a id='explore'></a>

# <font color = '#cc9900'> 3. Data Exploration</font>

In [None]:
# (number of rows, number of columns) of the working frame.
df.shape

In [None]:
# Column labels currently present in the working frame.
df.columns

In [30]:
# Remove the cap on displayed columns (this is a global pandas setting and
# stays in effect for later cells), then preview the first ten rows.
pd.options.display.max_columns = None
df.head(n=10)

Unnamed: 0_level_0,Name,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Region,Country,Dt_Customer,Recency,MntGroceries,MntStationery,MntHouseKeeping,MntWellness_&_Beauty,MntElectronics_&_Supplies,MntLimitedEdition,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Complain
Card_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
10003075,Mr. Adam Glover,1981,Master,Together,90782.0,0,0,Cork,Ireland,11/3/2018,66,622.0,70.0,678.0,,51.0,34.0,1,12,7,10,4,0,1,0,0,0,0
10003076,Mr. Cameron McDonald,1993,PhD,Single,113023.0,0,0,Kinsale,Ireland,4/18/2019,6,1014.0,15.0,643.0,74.0,36.0,36.0,1,9,4,8,1,0,0,0,0,0,0
10003078,Mr. Keith Davidson,1982,PhD,Single,93571.0,0,1,Cork,Ireland,7/22/2018,10,639.0,88.0,185.0,64.0,53.0,160.0,3,13,5,13,5,0,0,0,0,0,0
10003079,Mr. Alexander Gill,1978,PhD,Single,91852.0,0,1,Kinsale,Ireland,2/3/2018,26,806.0,56.0,350.0,54.0,54.0,260.0,1,15,9,11,6,0,0,0,1,0,0
10003080,Mr. Neil Piper,1968,PhD,Married,22386.0,1,1,Cork,Ireland,10/31/2018,65,32.0,1.0,29.0,2.0,0.0,20.0,5,9,3,3,9,1,0,0,0,0,0
10003081,Mr. Adrian Walsh,1979,PhD,Married,69485.0,1,1,Killarney,Ireland,12/7/2017,73,293.0,24.0,49.0,4.0,33.0,12.0,7,11,4,6,7,0,0,0,0,0,0
10003083,Mr. Carl Baker,1988,PhD,Single,109499.0,0,0,Cork,Ireland,10/31/2018,75,639.0,126.0,539.0,152.0,152.0,152.0,1,11,7,11,1,0,0,1,0,0,0
10003085,Mr. Liam Hemmings,1987,PhD,Married,97492.0,0,0,Killarney,Ireland,3/8/2019,93,344.0,123.0,281.0,83.0,46.0,28.0,1,9,5,5,1,0,0,0,0,0,0
10003089,Miss Alexandra Tucker,1990,PhD,Single,50289.0,2,0,Killarney,Ireland,11/24/2017,31,11.0,0.0,7.0,4.0,2.0,6.0,1,7,2,2,9,0,0,0,0,0,0
10003091,Mr. Frank Coleman,1953,Master,Widow,20043.0,0,0,Killarney,Ireland,4/17/2019,37,0.0,4.0,6.0,5.0,0.0,5.0,1,7,2,3,3,0,0,0,0,0,0


In [None]:
# How many customers share each age, listed from the highest age downwards.
df['Age'].value_counts().sort_index(ascending=False)

In [None]:
def _canonical_education(label):
    """Collapse one upper-cased education label into a canonical level.

    The substring checks run in a fixed order ('MA', then 'PH', then 'GR');
    a label that matches none of them is classified as 'HIGHSCHOOL'.

    Parameters
    ----------
    label : str
        Upper-cased raw education label.

    Returns
    -------
    str
        One of 'MASTERS', 'PHD', 'GRADUATE' or 'HIGHSCHOOL'.
    """
    if 'MA' in label:
        return 'MASTERS'
    if 'PH' in label:
        return 'PHD'
    if 'GR' in label:
        return 'GRADUATE'
    return 'HIGHSCHOOL'


# Upper-case first so spelling variants such as 'Graduation' / 'graduation'
# collapse together, then classify every label in a single pass.
# na_action='ignore' leaves missing values as NaN instead of raising on them.
df['Education'] = (
    df['Education']
    .str.upper()
    .map(_canonical_education, na_action='ignore')
)
df['Education'].value_counts()

In [None]:
# Merge fine-grained marital statuses into broader groups; labels that are
# not listed in the mapping are left unchanged.
marital_groups = {
    "Together": "Married",
    "Divorced": "Separated",
    "Widow": "Separated",
}
df['Marital_Status'] = df['Marital_Status'].replace(marital_groups)
df['Marital_Status'].value_counts()

In [None]:
# Frequency of each distinct income value, least frequent values first.
income_counts = df["Income"].value_counts()
income_counts.sort_values(ascending=True)

In [None]:
# Other quick checks, currently disabled:
# df['Kidhome'].value_counts().sort_index(ascending=False)
# df['Teenhome'].value_counts().sort_index(ascending=False)
# df['Region'].value_counts().sort_index(ascending=False)
# df['Country']=df['Country'].replace(["UK","France"],"Ireland")
# df['Country'].value_counts().sort_index(ascending=False)

# 'Dt_Customer' holds month/day/year strings, so sorting the raw labels orders
# them alphabetically ('10/31/2018' would come before '2/3/2018'). Parse them
# to datetimes first so the counts are listed chronologically. `df` itself is
# not modified. Values that cannot be parsed become NaT and are still counted
# (dropna=False) rather than silently disappearing.
signup_dates = pd.to_datetime(df['Dt_Customer'], format='%m/%d/%Y', errors='coerce')
signup_dates.value_counts(dropna=False).sort_index(ascending=True)

In [128]:
# Extract Gender
index=df.index

a=[]
a=df['Name']
a=df['Name'].to_numpy()
a.reshape(-1,1)

b=[]
for x in range(len(a)):
        b.append(str(a[x]).split()[0])

df['Gender']=b        
genro="Gender"
col_gen=df.pop("Gender")
df.insert(1,genro,col_gen)
df

df['Gender']=df['Gender'].replace(["nan"],"Mr.")
df["Gender"].value_counts().sort_values(ascending=True)

df

Unnamed: 0_level_0,Name,Gender,Age,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Region,Country,Dt_Customer,Recency,MntGroceries,MntStationery,MntHouseKeeping,MntWellness_&_Beauty,MntElectronics_&_Supplies,MntLimitedEdition,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Complain
Card_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
10003075,Mr. Adam Glover,Mr.,40,1981,Master,Together,90782.0,0,0,Cork,Ireland,11/3/2018,66,622.0,70.0,678.0,,51.0,34.0,1,12,7,10,4,0,1,0,0,0,0
10003076,Mr. Cameron McDonald,Mr.,28,1993,PhD,Single,113023.0,0,0,Kinsale,Ireland,4/18/2019,6,1014.0,15.0,643.0,74.0,36.0,36.0,1,9,4,8,1,0,0,0,0,0,0
10003078,Mr. Keith Davidson,Mr.,39,1982,PhD,Single,93571.0,0,1,Cork,Ireland,7/22/2018,10,639.0,88.0,185.0,64.0,53.0,160.0,3,13,5,13,5,0,0,0,0,0,0
10003079,Mr. Alexander Gill,Mr.,43,1978,PhD,Single,91852.0,0,1,Kinsale,Ireland,2/3/2018,26,806.0,56.0,350.0,54.0,54.0,260.0,1,15,9,11,6,0,0,0,1,0,0
10003080,Mr. Neil Piper,Mr.,53,1968,PhD,Married,22386.0,1,1,Cork,Ireland,10/31/2018,65,32.0,1.0,29.0,2.0,0.0,20.0,5,9,3,3,9,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10013063,Mr. Jack Ince,Mr.,36,1985,Graduation,Married,28144.0,1,0,Cork,Ireland,3/21/2018,41,6.0,18.0,13.0,12.0,0.0,28.0,2,8,2,3,7,0,0,0,0,0,0
10013064,Mr. Eric Davies,Mr.,54,1967,PhD,Married,104990.0,0,1,Cork,Ireland,3/1/2018,75,1234.0,80.0,424.0,0.0,135.0,173.0,3,8,7,7,6,0,0,0,0,0,0
10013067,Mr. Justin Rampling,Mr.,64,1957,PhD,Divorced,87399.0,0,0,Killarney,Ireland,8/5/2018,1,562.0,66.0,248.0,80.0,10.0,150.0,1,15,4,11,5,0,0,0,0,0,0
10013068,Mr. Kevin Lee,Mr.,38,1983,graduation,Together,94367.0,0,1,Kinsale,Ireland,9/24/2017,1,731.0,44.0,207.0,32.0,42.0,41.0,2,8,4,9,7,0,0,0,0,0,0
