# The More The Merrier (Data Cleaning)

**Description:** This notebook is dedicated to the preprocessing and cleaning of raw data stored in *csv* files using the 
*Pandas* library, specifically focusing on three key datasets for this project.

- **Data:** Datasets to clean:
  - `2017_Entry_Exit.csv`
  - `2017_Average_Housing_Prices_in_London.csv`
  - `LondonUnderground_Stations_Boroughs.csv`


In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np

In [2]:
def load_data(filepath, index_col=None):
    '''
    This function loads raw data from a csv file into a pandas dataframe and, when asked to, sets the
    primary key as the index column
    Args:
        filepath: the raw data's filepath in csv format (anything pd.read_csv accepts)
        index_col: optional column name or position to use as the index (e.g. the primary key).
            Defaults to None, which keeps pandas' default integer index.
    Return:
        The loaded raw data into the pandas dataframe ready to be preprocessed
    '''
    # index_col=None is also pd.read_csv's own default, so existing one-argument calls behave as before
    return pd.read_csv(filepath, index_col=index_col)

In [139]:
class data_cleaner:
    '''
    Thin wrapper around a pandas dataframe that offers the cleaning steps used in this notebook.

    Every cleaning method returns a plain pandas dataframe (not a data_cleaner), so the result has to be
    wrapped again before another cleaning method can be called on it.
    '''
    def __init__(self, df):
        # dataframe currently held by this cleaner
        self.df = df

    def get_data(self):
        '''Return the dataframe currently held by this cleaner.'''
        return self.df

    def select_cols(self, cols):
        '''
        Select the relevant columns.
        Args:
            cols: list of the column names to keep
        Return:
            A dataframe containing only the selected columns. The dataframe held by this cleaner is
            left unchanged.
        '''
        return self.df[cols]

    def adjust_col_dtypes(self, col_dtypes):
        '''
        Convert columns to the given data types.
        Args:
            col_dtypes: dictionary where each column maps to a data type
        Return:
            The converted dataframe, which also becomes the dataframe held by this cleaner
        '''
        # astype returns a new dataframe, so the result has to be stored back
        self.df = self.df.astype(col_dtypes)
        return self.df

    def rename_cols(self, rename):
        '''
        Replace all column names.
        Args:
            rename: list of new column names, one per column, in column order
        Return:
            The renamed dataframe, which also becomes the dataframe held by this cleaner
        '''
        # rename a copy: assigning .columns on self.df directly would also rename the caller's
        # original dataframe, because __init__ stores a reference to it rather than a copy
        renamed = self.df.copy()
        renamed.columns = rename
        self.df = renamed
        return self.df

In [148]:
# Clean the entry/exit data step by step: keep the relevant columns, shorten the column names,
# then set each column's data type.
# NOTE: `raw_df` must hold the 2017 entry/exit data when this cell runs. It is loaded by the cell
# that reads 2017_Entry_Exit.csv and is rebound by the cell that loads the housing prices.
entry_exit_selected = data_cleaner(raw_df).select_cols(['Station_ID', 'Station_Name', 'AnnualEntryExit_Mill'])
entry_exit_renamed = data_cleaner(entry_exit_selected).rename_cols(['id', 'name', 'z'])
df = data_cleaner(entry_exit_renamed).adjust_col_dtypes({'id': np.int64,
                                                         'name': str,
                                                         'z': np.float64})
df.dtypes

id        int64
name     object
z       float64
dtype: object

In [116]:
def clean_data(df, cols, col_dtypes, rename):
    '''
    This function does the necessary cleaning of data such as selecting relevant columns, adjusting each
    column's data type and simplifying the column names.
    Args:
        df: raw dataframe to be cleaned
        cols: list of relevant columns to keep, in the order matching rename
        col_dtypes: dictionary where each (original) column name maps to a data type
        rename: list of simplified column names, one per entry of cols, in the same order
    Return:
        The cleaned dataframe; the raw dataframe passed in is not modified
    '''
    # astype returns a new dataframe instead of converting in place, so its result must be kept.
    # The conversion runs before renaming because col_dtypes is keyed by the original column names.
    cleaned = df[cols].astype(col_dtypes)
    cleaned.columns = rename
    return cleaned

In [4]:
# data preprocessing
def preprocessing_data(df):
    '''
    Placeholder for the data preprocessing step, which has not been written yet.
    Args:
        df: dataframe to be preprocessed
    Raises:
        NotImplementedError: always, until the preprocessing logic is implemented
    '''
    # a def without a body is a syntax error, so fail explicitly instead of leaving the cell unrunnable
    raise NotImplementedError('preprocessing_data has not been implemented yet')

In [7]:
# 2017 entry/exit data: list of all London stations, each with its respective frequency of
# touch-ins and touch-outs.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
raw_df = load_data(filepath=r'C:\Users\pjxph\Documents\Data Science Projects\The More The Merrier\raw data\2017_Entry_Exit.csv')

In [13]:
# 2017 average house prices: list of all London boroughs and their respective average house price.
# NOTE(review): this rebinds `raw_df`, replacing whatever dataset was loaded into it before.
raw_df = load_data(filepath=r'C:\Users\pjxph\Documents/Data Science Projects/The More The Merrier/raw data/2017_Average_Housing_Prices_in_London.csv')
# Pass the relevant columns, their target data types and the simplified column names to clean_data.
avg_hse_price = clean_data(
    df=raw_df,
    cols=['Area_ID', 'Area_Name', 'average_hse_price'],
    col_dtypes={'Area_ID': str, 'Area_Name': str, 'average_hse_price': np.float64},
    rename=['Id', 'Name', 'Avg_hse_price'],
)

In [14]:
# display the cleaned average house price dataframe as the cell output
avg_hse_price

Unnamed: 0,Id,Name,Avg_hse_price
0,E09000001,city of london,820305.0
1,E09000002,barking and dagenham,282441.0
2,E09000003,barnet,532924.0
3,E09000004,bexley,330066.0
4,E09000005,brent,472373.0
5,E09000006,bromley,436538.0
6,E09000007,camden,856070.0
7,E09000008,croydon,363241.0
8,E09000009,ealing,489364.0
9,E09000010,enfield,391874.0
