## Data Cleaning

In [None]:
import pandas as pd
import numpy as np

### Regex examples

In [None]:
import re
# Sample string mixing letters, spaces and digits.
text = "Hello World 123456"
# Character class that matches exactly one ASCII letter per match.
reg = re.compile(r'[A-Za-z]')
# findall returns every non-overlapping match, so each letter comes back as
# its own list element; the spaces and digits produce no match at all.
reg.findall(text)

['H', 'e', 'l', 'l', 'o', 'W', 'o', 'r', 'l', 'd']

In [None]:
text = "Hello World 123456"
# '*' means "zero or more" letters, so the pattern can also match the empty
# string. Whole words are returned as single matches, and an empty match is
# reported at each position where no letter is present (spaces, digits, end).
reg = re.compile(r'[A-Za-z]*')
reg.findall(text)

['Hello', '', 'World', '', '', '', '', '', '', '', '']

In [None]:
text ='My id is 75555688'
# Zero or more digits: the run of digits is returned as one match, while every
# non-digit position (and the end of the string) yields an empty-string match.
reg = re.compile(r'[0-9]*')
reg.findall(text)

['', '', '', '', '', '', '', '', '', '75555688', '']

<ul>
  <li>\d Any numeric digit from 0 to 9.</li>
  <li>\w Any letter, numeric digit, or the underscore character</li>
  <li>\s Any space, tab, or newline character.</li>
</ul>



In [None]:
text ='My id is 75555688'
# \d is the shorthand digit class; on this input it gives the same result as
# the explicit [0-9]* pattern, including the empty-string matches.
reg = re.compile(r'\d*')
reg.findall(text)

['', '', '', '', '', '', '', '', '', '75555688', '']

In [None]:
text ='My id is 75555688'
# \w covers letters, digits and underscore, so each word and the number are
# returned whole; the spaces and the end of the string give empty matches.
reg = re.compile(r'\w*')
reg.findall(text)

['My', '', 'id', '', 'is', '', '75555688', '']

In [None]:
text ='My id is 75555688'
# \s without a quantifier matches one whitespace character at a time, so the
# result has one entry per space in the string and no empty matches.
reg = re.compile(r'\s')
reg.findall(text)

[' ', ' ', ' ']

### Data exploration

In [None]:
# Load the raw data; the relative path is resolved against the notebook's
# current working directory.
df = pd.read_csv("cookies_modified.csv")

In [None]:
df.head()

Unnamed: 0,Date,Day,Temperature,Salesman,Tweets,Price,Sales
0,1/1/2019,Tuesday,72.0,John,2.0,0.5,
1,,,,,,,
2,1/3/2019,Thursday,,John,5.0,0.5,172.0
3,1/4/2019,Friday,100.0,John,7.0,0.5,150.0
4,,,,,,,


In [None]:
df.shape

(50, 7)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         46 non-null     object 
 1   Day          46 non-null     object 
 2   Temperature  45 non-null     float64
 3   Salesman     44 non-null     object 
 4   Tweets       46 non-null     object 
 5   Price        46 non-null     float64
 6   Sales        44 non-null     object 
dtypes: float64(2), object(5)
memory usage: 2.9+ KB


In [None]:
# Discard rows in which every column is missing; rows with at least one
# value are kept.
df = df.dropna(how='all')

### Explore Sales and clean

In [None]:
df['Sales'].unique()

array([nan, '172', '150', '96', '177', '190', '154', '160', '161', '185',
       '137', '$', '85', '122', '187', '165', '158', '192', '87', '156',
       '84', '125', '166', '198', '105', '121', '194', '94', '193', '114',
       '110', '162', '141', '174', '142', '126', '199', '200', 'hello'],
      dtype=object)

In [None]:
# Any Sales entry that contains a non-digit character (e.g. '$', 'hello') is
# replaced by a missing value. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0 and raises AttributeError there.
df['Sales'] = df['Sales'].replace('[^0-9]', np.nan, regex=True)

In [None]:
df['Sales'].unique()

array([nan, '172', '150', '96', '177', '190', '154', '160', '161', '185',
       '137', '85', '122', '187', '165', '158', '192', '87', '156', '84',
       '125', '166', '198', '105', '121', '194', '94', '193', '114',
       '110', '162', '141', '174', '142', '126', '199', '200'],
      dtype=object)

In [None]:
# Replace missing Sales with the integer 0. The remaining values are still
# strings, so the column keeps object dtype after this step.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['Sales'] = df['Sales'].replace(np.nan, 0)

In [None]:
df['Sales'].unique()

array([0, '172', '150', '96', '177', '190', '154', '160', '161', '185',
       '137', '85', '122', '187', '165', '158', '192', '87', '156', '84',
       '125', '166', '198', '105', '121', '194', '94', '193', '114',
       '110', '162', '141', '174', '142', '126', '199', '200'],
      dtype=object)

In [None]:
df.columns

Index(['Date', 'Day', 'Temperature', 'Salesman', 'Tweets', 'Price', 'Sales'], dtype='object')

### Explore Salesman and perform cleaning

In [None]:
df['Salesman'].unique()

array(['John', '  John', '  Ada', 'Ada', '100', '101', nan], dtype=object)

In [None]:
# Clean the Salesman column in two steps:
#   1. Strip surrounding whitespace so padded names such as '  John' and
#      '  Ada' are kept as 'John' / 'Ada'. Without this step the regex below
#      matches the leading spaces and throws these valid names away.
#   2. Replace any entry that still contains a non-letter character
#      (e.g. '100', '101') with a missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['Salesman'] = (
    df['Salesman']
    .str.strip()
    .replace('[^A-Za-z]', np.nan, regex=True)
)

In [None]:
df['Salesman'].unique()

array(['John', nan, 'Ada'], dtype=object)

In [None]:
# Walk over every column and print its name followed by its distinct values,
# to spot any remaining dirty entries.
for col in df.columns:
    print(f"{col}:\n")
    distinct_values = df[col].unique()
    print(distinct_values)

Date:

['1/1/2019' '1/3/2019' '1/4/2019' '1/6/2019' '1/7/2019' '1/9/2019'
 '1/10/2019' '1/11/2019' '1/12/2019' '1/13/2019' '1/14/2019' '1/15/2019'
 '1/16/2019' '1/17/2019' '1/18/2019' '1/19/2019' '1/20/2019' '1/21/2019'
 '1/22/2019' '1/23/2019' '1/24/2019' '1/25/2019' '1/26/2019' '1/27/2019'
 '1/28/2019' '1/29/2019' '1/30/2019' '1/31/2019' '2/1/2019' '2/2/2019'
 '2/3/2019' '2/4/2019' '2/5/2019' '2/6/2019' '2/7/2019' '2/8/2019'
 '2/9/2019' '2/11/2019' '2/12/2019' '2/13/2019' '2/14/2019' '2/15/2019'
 '2/16/2019' '2/17/2019' '2/18/2019' '2/19/2019']
Day:

['Tuesday' 'Thursday' 'Friday' 'Sunday' 'Monday' 'Wednesday' 'Saturday']
Temperature:

[ 72.  nan 100.  91.  81.  69.  61.  79.  94.  80.  64.  68.  74.  89.
  87.  82.  60.  75.  96.  84.  92.  65.  67.  70.  86.  71.  90.  78.
  85.  83.]
Salesman:

['John' nan 'Ada']
Tweets:

['2' '5' '7' '8' '3' '10' '1' 'H' '6' '4' '0' '9']
Price:

[0.5 0.3]
Sales:

[0 '172' '150' '96' '177' '190' '154' '160' '161' '185' '137' '85' '122'
 '187' '165

### Datetime conversion

In [None]:
# Every date in the file is written month/day/year (e.g. '1/13/2019').
# Passing the format explicitly avoids relying on pandas' format inference
# and makes a malformed date fail loudly instead of being guessed at.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

In [None]:
df['Tweets'].unique()

array(['2', '5', '7', '8', '3', '10', '1', 'H', '6', '4', '0', '9'],
      dtype=object)

In [None]:
# Tweets entries containing a non-digit character (e.g. 'H') become missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['Tweets'] = df['Tweets'].replace('[^0-9]', np.nan, regex=True)

In [None]:
df['Tweets'].unique()

array(['2', '5', '7', '8', '3', '10', '1', nan, '6', '4', '0', '9'],
      dtype=object)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46 entries, 0 to 49
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         46 non-null     datetime64[ns]
 1   Day          46 non-null     object        
 2   Temperature  45 non-null     float64       
 3   Salesman     38 non-null     object        
 4   Tweets       45 non-null     object        
 5   Price        46 non-null     float64       
 6   Sales        46 non-null     object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 2.9+ KB


In [None]:
df['Sales'].unique()

array([0, '172', '150', '96', '177', '190', '154', '160', '161', '185',
       '137', '85', '122', '187', '165', '158', '192', '87', '156', '84',
       '125', '166', '198', '105', '121', '194', '94', '193', '114',
       '110', '162', '141', '174', '142', '126', '199', '200'],
      dtype=object)

In [None]:
# Turn the integer 0 placeholder in Sales back into a missing value so that a
# subsequent dropna can remove those rows. The string values are unaffected.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['Sales'] = df['Sales'].replace(0, np.nan)

In [None]:
# Keep only the rows that have a value in every column.
df = df.dropna()

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34 entries, 3 to 48
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         34 non-null     datetime64[ns]
 1   Day          34 non-null     object        
 2   Temperature  34 non-null     float64       
 3   Salesman     34 non-null     object        
 4   Tweets       34 non-null     object        
 5   Price        34 non-null     float64       
 6   Sales        34 non-null     object        
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 2.1+ KB


### Datatype conversions 

In [None]:
# Convert the digit strings in Tweets to integers. 'int' is the platform
# default integer width; the info() output in this notebook shows int32.
df['Tweets'] = df['Tweets'].astype('int')

In [None]:
# Convert the digit strings in Sales to 64-bit floats.
df['Sales'] = df['Sales'].astype('float64')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34 entries, 3 to 48
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         34 non-null     datetime64[ns]
 1   Day          34 non-null     object        
 2   Temperature  34 non-null     float64       
 3   Salesman     34 non-null     object        
 4   Tweets       34 non-null     int32         
 5   Price        34 non-null     float64       
 6   Sales        34 non-null     float64       
dtypes: datetime64[ns](1), float64(3), int32(1), object(2)
memory usage: 2.0+ KB


In [None]:
### Write the clean file

In [None]:
# Write the cleaned frame to disk. to_csv also writes the index by default;
# after the row drops above the index is non-contiguous (3 to 48 per info()).
# NOTE(review): pass index=False if the index column is not wanted downstream.
df.to_csv("cookie_clean.csv")

### Can we engineer the Temperature column into categories (e.g. cold, mild, warm, hot) for modeling purposes?

In [None]:
df['Temperature'].unique()

array([100.,  81.,  69.,  61.,  79.,  80.,  64.,  94.,  74.,  89.,  87.,
        82.,  60.,  75.,  84.,  92.,  65.,  70.,  86.,  71.,  90.,  68.,
        78.,  85.,  96.])

In [None]:
def convert_to_category(temp):
    """Map a temperature reading to a coarse category label.

    The value is first truncated to an integer with ``int()`` (so 65.9 is
    treated as 65), then bucketed:

    * ``"Hot"``  -- 100 or above
    * ``"Mild"`` -- above 65 and below 100
    * ``"cold"`` -- 65 or below

    Parameters
    ----------
    temp : any value accepted by ``int()``
        The temperature reading.

    Returns
    -------
    str
        One of ``"Hot"``, ``"Mild"``, ``"cold"``, or ``"Not Valid"`` when the
        value cannot be converted to an integer (e.g. NaN, None, infinity or
        a non-numeric string).
    """
    try:
        temp = int(temp)
    except (TypeError, ValueError, OverflowError):
        # int() raises TypeError for None, ValueError for NaN / non-numeric
        # strings and OverflowError for infinity.
        return "Not Valid"

    # Exactly 100 belongs to "Hot". The previous `> 100` / `< 100` pair left
    # 100 unmatched, so it fell through to "cold".
    if temp >= 100:
        return "Hot"
    if temp > 65:
        return "Mild"
    return "cold"
# Derive a new Temp_category column by applying convert_to_category to each
# Temperature value.
df["Temp_category"] = df['Temperature'].apply(convert_to_category)