# 4.9 Visualizing Data Part 1

#### Points of this Script:
1. Imports
2. Data wrangling
3. Consistency checks
4. Combining dataframes
5. Addressing Memory
6. Exports

### 1. Imports

In [None]:
# Importing libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [None]:
# Load the raw customer table from the project's "Original Data" folder

path = r'C:\Users\walls\Documents\Coding\Data Analysis\CareerFoundry\Data Immersion A4\Instacart Basket Analysis 01-25'
df_customer = pd.read_csv(
    os.path.join(path, 'Data', 'Original Data', 'customers.csv'),
    index_col=False,
)

### 2. Data Wrangling

In [None]:
# Preview the first rows to inspect column names and sample values
df_customer.head()

In [None]:
# (rows, columns) of the customer table
df_customer.shape

In [None]:
# Preview the last rows to confirm the file was read through to the end
df_customer.tail()

##### Observations:

1. "surnam" - misspelled
2. "STATE" - all caps
3. "Age", "First Name", "Gender" - capitalized
4. "n_dependants" vague

In [None]:
# Structure summary of the customer table: columns, dtypes and memory usage
df_customer.info()

##### Observations:
1. user_id is stored as a numeric type, but it is an identifier and should be treated as a string (object)
2. 10 columns, 206209 rows

In [None]:
# Changing column names and applying lower case lettering
#
# Step 1 lower-cases every column name; step 2 renames the columns whose
# lower-cased name is still misspelled, contains a space, or is vague.
# Only lower-case keys are listed in step 2: after step 1 a key containing
# capitals (e.g. 'Gender', 'STATE', 'Age') can never match a column, and
# rename() silently ignores keys that are not found.

df_customer = df_customer.rename(columns=str.lower).rename(
    columns={
        'first name': 'first_name',
        'surnam': 'surname',
        'n_dependants': 'num_dependants',
        'fam_status': 'marriage_status',
    }
)

In [None]:
# Confirm the renamed columns
df_customer.head()

In [None]:
# "user_id" is an identifier rather than a quantity, so store it as text

df_customer['user_id'] = df_customer['user_id'].astype(str)

In [None]:
# Confirm the dtype change of "user_id"
df_customer.info()

### 3. Consistency Check

In [None]:
# Descriptive statistics, used to spot implausible values
df_customer.describe()

In [None]:
# Checking for mixed-type columns
# A column is reported when at least one cell holds a Python type that
# differs from the type of the cell in the first row.

for column in df_customer.columns:
    cell_types = df_customer[[column]].map(type)
    first_row_type = df_customer[[column]].iloc[0].apply(type)
    differs = (cell_types != first_row_type).any(axis=1)
    if differs.any():
        print(column)

In [None]:
# Changing column to str data type
#
# A plain .astype('str') also converts missing values (NaN) into the literal
# text 'nan', which would make the missing-value check further down report
# no missing first names. The cast is therefore applied to every cell and the
# originally missing cells are masked back to missing afterwards.
# NOTE: while the column still contains NaN, the mixed-type check keeps
# listing it, because NaN is a float and the other cells are str.

df_customer['first_name'] = (
    df_customer['first_name']
    .astype('str')
    .mask(df_customer['first_name'].isna())
)

In [None]:
# Re-run the mixed-type check: print every column in which some cell's
# Python type differs from the type of the first row's cell.
for column in df_customer.columns:
    cell_types = df_customer[[column]].map(type)
    first_row_type = df_customer[[column]].iloc[0].apply(type)
    differs = (cell_types != first_row_type).any(axis=1)
    if differs.any():
        print(column)

In [None]:
# Count the missing values in each column
df_customer.isna().sum()

In [None]:
# Checking for duplicates
# duplicated() flags every row that fully repeats an earlier row.

df_dups = df_customer[df_customer.duplicated()]

# Show the result so the check is visible: 0 rows means no full duplicates
df_dups.shape

### 4. Combining Data Frames

In [None]:
# Load the previously prepared orders/products frame from "Prepared Data"

df_op = pd.read_pickle(
    os.path.join(path, 'Data','Prepared Data', 'ords_prods_merge_agg.pkl')
)

In [None]:
# Preview the first two rows of the prepared data
df_op.head(2)

In [None]:
# (rows, columns) of the prepared data, for comparison after the merge
df_op.shape

In [None]:
# Structure summary of the prepared data: columns, dtypes and memory usage
df_op.info()

In [None]:
# The merge key must have the same dtype on both sides; df_customer's
# "user_id" was converted to str earlier, so do the same for df_op

df_op['user_id'] = df_op['user_id'].astype(str)

In [None]:
# Join the customer table to the prepared orders/products data on "user_id".
# An inner join (the pandas default) keeps only user_ids present in both frames.

df_customer_op_merged = pd.merge(df_customer, df_op, on='user_id', how='inner')

In [None]:
# Preview the merged frame
df_customer_op_merged.head()

In [None]:
# (rows, columns) after the merge; compare the row count with df_op.shape
df_customer_op_merged.shape

### 5. Addressing Memory

In [None]:
# Smallest and largest value of every int64 column, to judge which
# narrower integer type each column would fit into

(
    df_customer_op_merged
    .select_dtypes(include='int64')
    .agg(['min', 'max'])
)

In [None]:
# Changing column datatypes for memory
#
# Narrowing casts are not range-checked: an integer outside the target range
# can wrap around and a float column loses its decimals, both without any
# error. Each column is therefore verified first and is left unchanged (with
# a printed message) when it is not an integer column or when its values do
# not fit the target type.
narrow_dtypes = {
    'int32': ['product_id', 'order_id', 'income', 'avg_price'],
    'int16': [
        'aisle_id', 'department_id', 'order_count', 'orders_day_of_week',
        'order_hour_of_day', 'add_to_cart_order', 'reordered', 'age',
        'num_dependants', 'max_order',
    ],
}

for target_dtype, column_names in narrow_dtypes.items():
    limits = np.iinfo(target_dtype)
    for column_name in column_names:
        values = df_customer_op_merged[column_name]
        if not pd.api.types.is_integer_dtype(values):
            print(f'{column_name}: kept as {values.dtype} (not an integer column)')
            continue
        if values.min() < limits.min or values.max() > limits.max:
            print(f'{column_name}: kept as {values.dtype} (values do not fit {target_dtype})')
            continue
        df_customer_op_merged[column_name] = values.astype(target_dtype)

In [None]:
# Confirm the dtypes and memory usage after the casts
df_customer_op_merged.info()

##### Summary
1. df_customer newly imported
2. df_customer shape (206209, 10)
3. 7 column names changed -- first_name, surname, gender, state, age, num_dependants, marriage_status
4. user_id and first_name changed to str
5. df_customer merged with df_op as df_customer_op_merged
6. df_op shape (32404859, 24)
7. Missing values found in df_op 
8. df_customer_op_merged shape (32404859, 33)
9. int64 dtypes changed to int32 or int16 for memory

### 6. Exports

In [None]:
# Save the merged frame to the "Prepared Data" folder as a pickle file
df_customer_op_merged.to_pickle(
    os.path.join(path, 'Data', 'Prepared Data', 'customer_merged.pkl')
)