### Read Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Location of the raw dataset CSV. A raw string is used so the Windows
# backslashes are taken literally instead of being parsed as (invalid)
# escape sequences such as "\B" or "\S".
path = r'D:\BIM\Summer Project\datasets\ibm-dataset.csv'

# Read dataset
df = pd.read_csv(path)

# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Preview the first rows of the loaded frame
df.head()

# Fix pie chart issue
# income_counts = df['Department'].value_counts()
# plt.pie(x=income_counts, autopct='%1.1f%%')
# plt.show()





### Clean Dataset

In [None]:
# Check missing values (a column counts as soon as it has at least one null)
# print(df.columns[df.isnull().sum() > 0])

# Drop every row that contains at least one missing value
df = df.dropna()

# Check missing values after dropping: list the columns that still contain
# nulls. The threshold is "> 0" so a column with a single missing value is
# reported too; an empty Index is the expected output here.
print(df.columns[df.isnull().sum() > 0])

# Remove fully duplicated rows
df = df.drop_duplicates()

# Inspect the dtype of the Age column (no conversion is performed here)
print(df['Age'].dtype)

### Convert to Binary or One Hot Encoding

In [None]:
# Convert to Binary: Attrition, Gender, Over18, OverTime
# NOTE: Series.map with a dict turns any value missing from the mapping into NaN.
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Over18'] = df['Over18'].map({'Y': 1, 'N': 0})
df['OverTime'] = df['OverTime'].map({'Yes': 1, 'No': 0})

# Convert to One Hot Encoding: BusinessTravel, Department, EducationField,
# JobRole, MaritalStatus.
# Each key is a source column; the value is the prefix for its dummy columns
# (None keeps the bare category names).
# NOTE(review): StockOptionLevel is left as-is (not one-hot encoded) - confirm
# that this is intended.
one_hot_prefixes = {
    'BusinessTravel': None,
    'Department': 'Dept',
    'EducationField': 'EduField',
    'JobRole': 'JobRole',
    'MaritalStatus': None,
}
for column, prefix in one_hot_prefixes.items():
    # dtype='int64' makes the dummy columns 0/1 integers directly, so no
    # element-wise True/False -> 1/0 pass over the whole frame is needed.
    dummies = pd.get_dummies(df[column], prefix=prefix, dtype='int64')
    df = df.join(dummies).drop(column, axis=1)

df


### Visualizing and Removing Unnecessary Columns

In [None]:
# Plot histograms to find irrelevant columns
# Draw one histogram per numeric column to spot irrelevant columns
df.hist(figsize=(20, 15))

# Tighten the spacing of the figure that df.hist just created, then render it
current_figure = plt.gcf()
current_figure.tight_layout()
plt.show()

In [None]:
# Remove the columns judged unnecessary (presumably constant in the histograms
# plotted earlier - verify). DataFrame.drop returns a new frame rather than
# modifying df, so the result must be assigned back for the drop to take effect.
df = df.drop(columns=['EmployeeCount', 'Over18', 'StandardHours'])
df

### Save

In [None]:
# Output paths are raw strings so the Windows backslashes are kept literally
# instead of being parsed as (invalid) escape sequences such as "\B" or "\c".

# Save cleaned data as CSV (index=False: the row index is not written)
df.to_csv(r'D:\BIM\Summer Project\datasets\cleaned_ibm_dataset.csv', index=False)

# Save cleaned data as PKL
df.to_pickle(r'D:\BIM\Summer Project\datasets\cleaned_ibm_dataset.pkl')