# The machine learning pipeline 
## 1. Processing


#### Data operations in numpy

In [3]:

# more than one dimensions 
import numpy as np 
a = np.array([[1, 2], [3, 4], [5, 6]]) 
print('more than one dimensions')
print(a)
# minimum dimensions 
a = np.array([1, 2, 3,4,5,6], ndmin = 2) 
print('minimum dimensions')
print(a)
# dtype parameter 
a = np.array([1, 2, 3], dtype = complex) 
print('dtype parameter')
print(a)

more than one dimensions
[[1 2]
 [3 4]
 [5 6]]
minimum dimensions
[[1 2 3 4 5 6]]
dtype parameter
[1.+0.j 2.+0.j 3.+0.j]


In [2]:
import pandas as pd
prices=[ 
         [71,72,73],[77,78,70],[79,72,73],
         [74,78,70],[71,88,73],[77,68,70],
         [71,72,73],[77,78,70],[79,72,73],
         [74,78,70],[71,88,73],[77,68,70]
        ]
simple_dataframe=pd.DataFrame(prices)
print("Simple Data Frame\n",simple_dataframe)
header_dataframe=pd.DataFrame(prices,columns=['Jan','Feb','Mar'])
print("Header Data Frame\n",header_dataframe)
print("Fetch unic record\n",header_dataframe['Feb'][0])
print("First five rows are\n")
print(simple_dataframe.head())
print("Last five rows are\n")
print(simple_dataframe.tail())
print("Last five rows with header\n")
print(header_dataframe.tail())


Simple Data Frame
      0   1   2
0   71  72  73
1   77  78  70
2   79  72  73
3   74  78  70
4   71  88  73
5   77  68  70
6   71  72  73
7   77  78  70
8   79  72  73
9   74  78  70
10  71  88  73
11  77  68  70
Header Data Frame
     Jan  Feb  Mar
0    71   72   73
1    77   78   70
2    79   72   73
3    74   78   70
4    71   88   73
5    77   68   70
6    71   72   73
7    77   78   70
8    79   72   73
9    74   78   70
10   71   88   73
11   77   68   70
Fetch unic record
 72
First five rows are

    0   1   2
0  71  72  73
1  77  78  70
2  79  72  73
3  74  78  70
4  71  88  73
Last five rows are

     0   1   2
7   77  78  70
8   79  72  73
9   74  78  70
10  71  88  73
11  77  68  70
Last five rows with header

    Jan  Feb  Mar
7    77   78   70
8    79   72   73
9    74   78   70
10   71   88   73
11   77   68   70


#### Data cleanings

In [5]:
#generate an examples data
import pandas as pd
import numpy as np
df=np.random.randn(5, 3)
print('Generate random data:' ,df)
df = pd.DataFrame(df, index=['a', 'c', 'e', 'f', 'h'],columns=['one', 'two', 'three'])
print('Created dataframe: ',df)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print('Reindex:',df)

Generate random data: [[ 0.14937393 -0.10407521 -0.31522088]
 [ 0.93238994 -1.29169207  1.02748957]
 [-1.14666173  0.00576266  0.81772229]
 [-0.67107769  2.29148439  0.42212297]
 [ 0.36860802  1.20143939 -0.62375188]]
Created dataframe:          one       two     three
a  0.149374 -0.104075 -0.315221
c  0.932390 -1.291692  1.027490
e -1.146662  0.005763  0.817722
f -0.671078  2.291484  0.422123
h  0.368608  1.201439 -0.623752
Reindex:         one       two     three
a  0.149374 -0.104075 -0.315221
b       NaN       NaN       NaN
c  0.932390 -1.291692  1.027490
d       NaN       NaN       NaN
e -1.146662  0.005763  0.817722
f -0.671078  2.291484  0.422123
g       NaN       NaN       NaN
h  0.368608  1.201439 -0.623752


In [6]:
# Check for Missing Values
import pandas as pd
import numpy as np
 
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df['one'].isnull())

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [7]:
# Cleaning / Filling Missing Data
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],columns=['one','two', 'three'])
print(df)
df = df.reindex(['a', 'b', 'c'])
print(df)
print ("NaN replaced with '0':")
print(df.fillna(0))

        one       two     three
a  0.244636  0.004951 -0.441201
c -1.961109  1.310292 -0.253309
e  0.197359  0.114193 -0.171315
        one       two     three
a  0.244636  0.004951 -0.441201
b       NaN       NaN       NaN
c -1.961109  1.310292 -0.253309
NaN replaced with '0':
        one       two     three
a  0.244636  0.004951 -0.441201
b  0.000000  0.000000  0.000000
c -1.961109  1.310292 -0.253309


In [8]:
# Fill NA Forward and Backward
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print('Using fill methods')
print(df.fillna(method='pad'))
print('Replace all NaN elements with 0s.')
print(df.fillna(0))
print(df.fillna(method='ffill'))
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
print(df.fillna(value=values))

Using fill methods
        one       two     three
a -0.313059  0.313396  0.820121
b -0.313059  0.313396  0.820121
c -0.154770 -0.968054  0.179198
d -0.154770 -0.968054  0.179198
e  0.845823  0.826616  0.094818
f -1.017472 -0.289387 -0.184677
g -1.017472 -0.289387 -0.184677
h  0.593381  0.350253 -1.146741
Replace all NaN elements with 0s.
        one       two     three
a -0.313059  0.313396  0.820121
b  0.000000  0.000000  0.000000
c -0.154770 -0.968054  0.179198
d  0.000000  0.000000  0.000000
e  0.845823  0.826616  0.094818
f -1.017472 -0.289387 -0.184677
g  0.000000  0.000000  0.000000
h  0.593381  0.350253 -1.146741
        one       two     three
a -0.313059  0.313396  0.820121
b -0.313059  0.313396  0.820121
c -0.154770 -0.968054  0.179198
d -0.154770 -0.968054  0.179198
e  0.845823  0.826616  0.094818
f -1.017472 -0.289387 -0.184677
g -1.017472 -0.289387 -0.184677
h  0.593381  0.350253 -1.146741
        one       two     three
a -0.313059  0.313396  0.820121
b       NaN       N

### See dealing with missing values notebook examples

In [11]:
# Drop Missing Values
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
print(df)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df)
print("Drop NaN:",df.dropna())

        one       two     three
a  1.702012  1.951531  0.759594
c -0.978566  0.327051  0.769759
e  0.293319  0.283355  1.261603
f  0.256739 -0.015344 -0.193123
h  0.761929 -0.270097  0.081532
        one       two     three
a  1.702012  1.951531  0.759594
b       NaN       NaN       NaN
c -0.978566  0.327051  0.769759
d       NaN       NaN       NaN
e  0.293319  0.283355  1.261603
f  0.256739 -0.015344 -0.193123
g       NaN       NaN       NaN
h  0.761929 -0.270097  0.081532
Drop NaN:         one       two     three
a  1.702012  1.951531  0.759594
c -0.978566  0.327051  0.769759
e  0.293319  0.283355  1.261603
f  0.256739 -0.015344 -0.193123
h  0.761929 -0.270097  0.081532


In [15]:
#date time values  
import datetime
print ('The DateTime Today is  :', datetime.datetime.today())
print ('The Date Today is  :', datetime.date.today())
print ('This Year  :', datetime.date.today().year)
print ('This Month  :', datetime.date.today().month)
print ('This Day  :', datetime.date.today().day)
print ('Day Name  :', datetime.date.today().strftime('%A'))
print ('Month Name  :', datetime.date.today().strftime('%B'))

The DateTime Today is  : 2020-03-02 10:42:48.046771
The Date Today is  : 2020-03-02
This Year  : 2020
This Month  : 3
This Day  : 2
Day Name  : Monday
Month Name  : March


In [16]:
import datetime
#Capture the First Date
day1 = datetime.date(2018, 2, 12)
#print('day1:', datetime.date(2018, 2, 12).ctime())
print('day1:', day1.ctime())
# Capture the Second Date
day2 = datetime.date(2017, 8, 18)
print('day2:', day2.ctime())
# Find the difference between the dates
print('Number of Days:', day1-day2)
date_today  = datetime.date.today() 
# Create a delta of 6 Days 
no_of_days = datetime.timedelta(days=6)
print('delta of Six Days',no_of_days) 
# Use Delta for Past Date
before_six_days = date_today - no_of_days 
print('Before Six Days:', before_six_days)
# Use Delta for future Date
after_six_days = date_today + no_of_days 
print('After Six Days:', after_six_days) 

day1: Mon Feb 12 00:00:00 2018
day2: Fri Aug 18 00:00:00 2017
Number of Days: 178 days, 0:00:00
delta of Six Days 6 days, 0:00:00
Before Six Days: 2020-02-25
After Six Days: 2020-03-08


In [17]:
import datetime
date_today  = datetime.date.today() 
print('Today is: ', date_today)
# Create a delta of 6 Days 
no_of_days = datetime.timedelta(days=6)
print('delta of Six Days',no_of_days) 
# Use Delta for Past Date
before_six_days = date_today - no_of_days 
print('Before Six Days:', before_six_days)
# Use Delta for future Date
after_six_days = date_today + no_of_days
date1 = datetime.date(2018,12,23)
print('date1:',date1)
if date1 == before_six_days :
    print('Same Dates')
if date_today > date1:
    print('Past Date')
if date1 < after_six_days:
    print('Future Date')

Today is:  2020-03-02
delta of Six Days 6 days, 0:00:00
Before Six Days: 2020-02-25
date1: 2018-12-23
Past Date
Future Date
