# Object-Oriented Programming in Data Cleaning
***
_**Just a quick disclaimer: This project is intentionally overengineered for the purpose of showing my OOP and coding skills. Cleaning data with Python is far simpler than this.**_
***

In [43]:
import pandas as pd

In [44]:
class Cleaner:
    '''Stateless collection of small pandas clean-up helpers.

    No method reads or writes instance attributes; each one takes pandas
    data and returns the result rather than modifying its input.
    '''

    def CleanCatData(self, data):
        '''Clean one column of categorical (text) data.

        Every character that is not a letter is removed: symbols,
        punctuation, underscores, digits and ALL whitespace (single spaces
        included, not just runs of two or more). The letters that remain
        are then title-cased with ``str.title``.

        Example: ``' d A &^%$#l 9876e'`` is returned as ``'Dale'``.

        Parameters
        ----------
        data : pandas.Series
            Column of strings to clean. The ``.str`` accessor is used, so a
            whole DataFrame is not accepted; pass one column at a time.

        Returns
        -------
        pandas.Series
            New Series holding the cleaned strings.
        '''
        # `\W` matches everything except letters, digits and the underscore,
        # so digits (`\d`) and `_` have to be listed explicitly. Whitespace is
        # already covered by `\W`, which is why no trailing strip is needed.
        letters_only = data.str.lower().str.replace(r'[\W\d_]+', '', regex=True)
        return letters_only.str.title()

    def CleanNumData(self, data):
        '''Convert numeric values stored as strings/objects to a numeric dtype.

        Thin wrapper around ``pandas.to_numeric`` called with its default
        arguments.

        Parameters
        ----------
        data : pandas.Series (or any other input ``pandas.to_numeric`` accepts)
            Values to convert, e.g. ``['101', '102']``.

        Returns
        -------
        The converted values; a numeric Series when a Series is passed in.
        '''
        return pd.to_numeric(data)

    def ObjectToCategory(self, data):
        '''Cast pandas data to the ``category`` dtype.

        Parameters
        ----------
        data : pandas.Series or pandas.DataFrame
            Data to convert.

        Returns
        -------
        The result of ``data.astype('category')``.
        '''
        return data.astype('category')

    def SimpleMovingAverage(self, data, i=None):
        '''Compute a simple moving average.

        Parameters
        ----------
        data : pandas.Series or pandas.DataFrame
            Numeric data to average.
        i : int, optional
            Number of rolling periods (the window size). When omitted or
            ``None`` it falls back to 3.

        Returns
        -------
        The rolling mean of ``data``. With an integer window the leading
        ``i - 1`` entries are NaN because their window is not yet full.
        '''
        # Identity check: `None` is a singleton, and `==` could be overridden
        # by whatever object a caller happens to pass in.
        window = 3 if i is None else i
        return data.rolling(window=window).mean()

In [45]:
# Deliberately messy sample data. The names carry stray symbols, digits,
# extra whitespace and inconsistent casing; the ids are stored as strings,
# so pandas holds them as object dtype until they are converted explicitly.
df = pd.DataFrame(dict(
    Name=[
        ' C )(*&h    r 6789i(****s))',
        ' d A &^%$#l 9876e',
        'rI9cK  ',
        's @a m  ',
        'sTEV  e9',
    ],
    Id=['101', '102', '103', '104', '105'],
))

In [46]:
# Peek at the raw rows before any cleaning
print(df.head(n=5))

# Column dtypes and non-null counts of the raw frame
df.info()

                          Name   Id
0   C )(*&h    r 6789i(****s))  101
1             d A &^%$#l 9876e  102
2                      rI9cK    103
3                     s @a m    104
4                     sTEV  e9  105
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Id      5 non-null      object
dtypes: object(2)
memory usage: 212.0+ bytes


In [47]:
# One Cleaner instance, reused by every cleaning step that follows
cleaner = Cleaner()

In [48]:
# Strip the noise out of 'Name' (see Cleaner.CleanCatData for the rules)
df['Name'] = cleaner.CleanCatData(data=df['Name'])

# Confirm the names now read as plain, capitalized words
print(df.head(n=5))

# 'Name' is still object dtype here; only its contents have changed
df.info()

    Name   Id
0  Chris  101
1   Dale  102
2   Rick  103
3    Sam  104
4  Steve  105
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Id      5 non-null      object
dtypes: object(2)
memory usage: 212.0+ bytes


In [49]:
# Store 'Name' as a pandas categorical instead of generic object strings
df['Name'] = cleaner.ObjectToCategory(data=df['Name'])

# 'Name' should now be reported with the category dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   Name    5 non-null      category
 1   Id      5 non-null      object  
dtypes: category(1), object(1)
memory usage: 389.0+ bytes


In [50]:
# Parse the string ids in 'Id' into a numeric dtype
df['Id'] = cleaner.CleanNumData(data=df['Id'])

# 'Id' should now be reported with a numeric dtype instead of object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   Name    5 non-null      category
 1   Id      5 non-null      int64   
dtypes: category(1), int64(1)
memory usage: 389.0 bytes


In [51]:
# Add a 3-period simple moving average of 'Id' as a new 'MovAvg' column
df['MovAvg'] = cleaner.SimpleMovingAverage(data=df['Id'], i=3)

# The first two rows have no full 3-row window yet, so they show NaN
print(df.head(n=5))
df.info()

    Name   Id  MovAvg
0  Chris  101     NaN
1   Dale  102     NaN
2   Rick  103   102.0
3    Sam  104   103.0
4  Steve  105   104.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   Name    5 non-null      category
 1   Id      5 non-null      int64   
 2   MovAvg  3 non-null      float64 
dtypes: category(1), float64(1), int64(1)
memory usage: 429.0 bytes
