# Module 1 Python Data Analysis Libraries


In [None]:
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install pandas
!pip install scipy
!pip install sklearn

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sb

In [None]:
import pandas as pd

In [None]:
import sklearn

# Module 2 Introduction to Numpy

In [None]:
# Limitation of Python List
# Plain lists do not support element-wise arithmetic: `a - b` raises TypeError.
# The error is caught and printed so the limitation is still demonstrated while
# "Restart & Run All" can continue past this cell.
a = [1,1,1]
b = [2,2,2]
try:
    a-b
except TypeError as err:
    print(f'TypeError: {err}')

In [None]:
# Convert both lists to NumPy arrays; subtraction then works element by element.
a1, b1 = np.array(a), np.array(b)
np.subtract(a1, b1)

In [None]:
# Python List and Numpy Array data types
a = [1,'hi',2]
a1 = np.array(a)

## Array Attributes

In [None]:
a1.dtype

In [None]:
a = [1,1,1]
a1 = np.array(a)
a1.dtype

In [None]:
a = [1,1,1]
a1 = np.array(a,dtype=np.float32)
a1.dtype

In [None]:
a1.ndim

## Special 1D Functions

In [None]:
# Python way
# Build the integers 1..99 one element at a time with a plain loop.
a = []
for value in range(1, 100):
    a += [value]

In [None]:
a = np.arange(1,100,1)

In [None]:
a = np.linspace(1,100,200)

In [None]:
# 2 x 3 array of 16-bit integer zeros.
a = np.zeros(shape=(2, 3), dtype='int16')
a

In [None]:
# 2 x 3 float array in which every element is 8.
a = np.full((2, 3), 8.0)
a

In [None]:
# Ten samples from a normal distribution with mean 10 and standard deviation 2.
# A seeded generator is used so that re-running the notebook reproduces the
# same numbers (the unseeded global `np.random.randn` does not).
rng = np.random.default_rng(42)
a = rng.normal(loc=10, scale=2, size=10)
a

### Exercise

In [None]:
a = np.arange(99,3,-3)
a

In [None]:
a = np.linspace(99,3,100)
a

## Higher Dimension Arrays

In [None]:
a = [[1,2],[3,4]]
a

In [None]:
a1 = np.array(a,dtype=np.float32)
a1

In [None]:
a1.ndim

In [None]:
a1.dtype

### Exercise

In [None]:
a = [[[1,1],[2,2]],[[3,3],[4,4]]]
a1 = np.array(a)

In [None]:
a1.shape

In [None]:
# Math Functions
# NumPy's math functions accept scalars as well as arrays. Each result is
# printed because a notebook cell only displays its final expression, and the
# original overwrote `a` four times without showing anything.
print(np.exp(2))
print(np.sqrt(4))
print(np.sin(np.pi/2))
a = np.cos(np.pi/2)   # `a` still ends up holding the last result, as before
print(a)

## Slicing and Selecting Elements

In [None]:
a = np.arange(2, 20)

# Each slice object corresponds to one `a[start:stop:step]` expression:
# a[2:7], a[:3], a[3:], a[:], a[::2]
for window in (slice(2, 7), slice(None, 3), slice(3, None), slice(None), slice(None, None, 2)):
    print(a[window])
# A negative index counts from the end; -1 is the last element.
print(a[-1])


## Logical Indexing and Filtering Elements

In [None]:
# Python way
# Copy the non-negative entries of `a` into `b` with an explicit loop.
a = [1, -2, 3, -4, 5, -6, 7, -8]
b = []
for i in a:
    if i >= 0:
        b.append(i)

In [None]:
# The built-in `filter` returns a one-shot iterator. Materialise it right away
# so `b` holds the filtered values instead of an iterator that is already
# exhausted after it has been displayed once.
b = list(filter(lambda x:x>0,a))
b

In [None]:
# Numpy way
a1 = np.array(a)
a1>0

In [None]:
a1[a1>0]

### Exercise

In [None]:
a = np.arange(1,100,1)

In [None]:
a[a%3!=0]

## Transforming Data

In [None]:
# 24 elements arranged as 6 rows x 4 columns.
a = np.arange(24).reshape(6,4)
# -1 asks NumPy to infer that dimension: 24 / 6 gives 4 columns.
a = np.arange(24).reshape(6,-1)
# Here the row count is inferred: 24 / 6 gives 4 rows, so `a` ends up 4 x 6.
a = np.arange(24).reshape(-1,6)

In [None]:
a = np.arange(24).reshape(2,2,-1)

In [None]:
a.ndim

In [None]:
a.shape

In [None]:
b = a.ravel()

In [None]:
a.shape

In [None]:
## Statistical function

In [None]:
a = np.arange(1,10,1).reshape(3,3)
# A cell only displays its last expression, so the column means are printed
# explicitly instead of being computed and silently discarded.
print(np.mean(a,axis=0))   # axis=0 collapses the rows: one mean per column
np.mean(a,axis=1)          # axis=1 collapses the columns: one mean per row

# Module 3 Data Analysis with Pandas

## Create DataFrame

In [None]:
# Column-oriented data: each key becomes a DataFrame column, each list its values.
a = dict(
    Name=["Ally", "Belinda", "Jane", "Steve"],
    Height=[160, 165, 155, 180],
    Gender=list('FFFM'),
)

In [None]:
df2 = pd.DataFrame(a)

## Import Data

In [None]:
# Import CSV file
df = pd.read_csv('data/sample.csv')
df

In [None]:
# Import Excel File

!pip3 install xlrd
df = pd.read_excel('data/sample.xlsx')

## Dataframe attributes

In [None]:
# Print the frame's dimensions, column labels, row labels and underlying values.
for attribute in ('shape', 'columns', 'index', 'values'):
    print(getattr(df, attribute))

### Exercise

In [None]:
# Load only the columns used in the exercises and use the car name as row label.
mtcars = pd.read_csv(
    'data/mtcars.csv',
    usecols=['car_names', 'mpg', 'cyl', 'hp', 'am'],
    index_col='car_names',
)

In [None]:
mtcars.head(10)

In [None]:
mtcars.tail()

## Select Column Data

In [None]:
df['Name']

In [None]:
df[['Name']]

In [None]:
df.Name

In [None]:
df[['Name','Gender']]

### Exercise

In [None]:
# Select the two columns the variable name promises: 'mpg' and 'cyl'.
# (The original picked 'cyl' and 'hp', which did not match the name `mpg_cyl`.)
mpg_cyl = mtcars[['mpg','cyl']]
mpg_cyl

## Select Row Data

In [None]:
df.iloc[0]

In [None]:
df.loc[0]

In [None]:
# Replace the default integer index with zero-padded string labels
# ('001', '002', ...). The labels are generated from the frame's length so the
# assignment cannot fail with a length mismatch, however many rows were imported.
df.index = [f'{n:03d}' for n in range(1, len(df) + 1)]
df

In [None]:
df.loc['001']

### Exercise

In [None]:
# Select two cars by name. Index labels are compared after stripping
# surrounding whitespace, so a stray space (the original asked for
# 'Honda Civic ' with a trailing blank) cannot cause a KeyError.
wanted = ['Honda Civic', 'Porsche 914-2']
mtcars2 = mtcars[mtcars.index.str.strip().isin(wanted)]
mtcars2

## Append Column Data and Export Data

In [None]:
# Add a new column by assigning a list with one value per row.
# NOTE(review): five values are supplied, so `df` must have exactly five rows
# for this assignment to succeed — confirm against data/sample.xlsx.
df['Food'] = ['Veg','Non-Veg','Non-Veg','Non-Veg','Veg']

In [None]:
df.to_csv('data/sample2.csv')

### Exercise

In [None]:
# Export CSV data
mtcars2.to_csv('data/mtcars2.csv')

## Select Cell Data

In [None]:
# First value of the 'Name' column, selected by position.
# `.iloc[0]` is explicit: once the index holds string labels, a bare `[0]` only
# works through a deprecated positional fallback.
df['Name'].iloc[0]

In [None]:
df[['Height']].loc['002']

In [None]:
# Row labelled '001', then its first value by position.
# `.iloc[0]` avoids integer `[0]` lookup on a Series whose index is column names.
df.loc['001'].iloc[0]

## Slicing

In [None]:
df.iloc[[0,1]]

In [None]:
df.iloc[0:2]

In [None]:
df[['Name','Gender']].iloc[[0,2]]

## Filtering Data

In [None]:
df.Gender=='F'

In [None]:
df[df.Gender=='F']

### Exercise

In [None]:
# Cars with mpg above 20. The exercise must use `mtcars`: `df2` is the small
# Name/Height/Gender frame built earlier and has no 'mpg' column.
mtcars[mtcars.mpg>20]

In [None]:
# Combine three conditions with `&`: mpg above 20, am equal to 1 and fewer than
# 6 cylinders. The source frame is `mtcars`; `df2` has none of these columns.
df3 = mtcars[(mtcars.mpg>20)&(mtcars.am==1)&(mtcars.cyl<6)]

## Data Cleaning

### Missing Data

In [None]:
missing = pd.read_csv('data/missing.csv')

In [None]:
missing.isnull()

In [None]:
missing.isnull().sum()

In [None]:
# Forward fill: each gap takes the last valid value above it.
# `fillna(method=...)` is deprecated in current pandas; `.ffill()` is the replacement.
missing.ffill()

In [None]:
# Backward fill: each gap takes the next valid value below it.
# `fillna(method=...)` is deprecated in current pandas; `.bfill()` is the replacement.
missing.bfill()

In [None]:
missing.fillna({'Height':0,'Food':'Non-Veg'})

In [None]:
# Give 'Food' a fixed default first, then backward-fill whatever is still missing.
# `.bfill()` replaces the deprecated `fillna(method='bfill')`.
missing.fillna({'Food':'Non-Veg'}).bfill()

In [None]:
# Give 'Food' a fixed default, then fill the remaining gaps with column means.
# - Operates on `missing`, the frame this section is cleaning (the original
#   used `df` here, unlike every other cell in the section).
# - `numeric_only=True` is required: averaging text columns raises TypeError
#   in pandas 2.x.
missing.fillna({'Food':'Non-Veg'}).fillna(missing.mean(numeric_only=True))

In [None]:
missing.dropna()

### Duplicates

In [None]:
missing.duplicated()

In [None]:
missing.drop_duplicates()

In [None]:
missing.dropna().drop_duplicates()

In [None]:
missing.drop('Food',axis=1)

### Exercise

In [None]:
# Drop columns, then rows, in one chain.
# `errors='ignore'` on the column drop is needed because 'gear' is not among
# the columns loaded by the `usecols` list of the mtcars `read_csv` call, so a
# strict drop would raise KeyError.
mtcars.drop(columns=['am','gear'], errors='ignore').drop(index=['Merc 230','Ferrari Dino'])

## Joining Data

### Append

In [None]:
df2 = pd.read_csv('data/sample3.csv')

In [None]:
# Stack the rows of `df2` under `df`, keeping each frame's own index labels.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
pd.concat([df, df2])

In [None]:
# Same as above, but discard the old labels and renumber the rows 0..n-1.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
pd.concat([df, df2], ignore_index=True)

### Exercise

In [None]:
# Collect every index label containing 'Merc', then pull those rows.
merc = list(filter(lambda car_name: 'Merc' in car_name, mtcars.index))
merc_cars = mtcars.loc[merc]
merc_cars

In [None]:
# Collect every index label containing 'Toyota', then pull those rows.
toyota = list(filter(lambda car_name: 'Toyota' in car_name, mtcars.index))
toyota_cars = mtcars.loc[toyota]
toyota_cars

In [None]:
# Stack the Toyota rows on top of the Mercedes rows.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
toyota_merc_cars2 = pd.concat([toyota_cars, merc_cars])
toyota_merc_cars2

### Concat

In [None]:
df4=pd.read_csv('data/sample4.csv')

In [None]:
pd.concat([df,df4],axis=1)

### Exercise

In [None]:
toyota_merc_cars = pd.concat([toyota_cars,merc_cars])
toyota_merc_cars

### Merge

In [None]:
df5=pd.read_csv('data/sample5.csv')

In [None]:
# Compare the four join types on the same key pair ('Name' in `df`,
# 'Name2' in `df5`). Every result is printed, because a cell displays only its
# last expression and the original threw the first four merges away.
# The original's first call omitted `how`, which is the same as 'inner'.
for how in ('inner', 'outer', 'left', 'right'):
    print(f'how={how!r}')
    print(df.merge(df5, left_on='Name', right_on='Name2', how=how))


### Exercise

In [None]:
# Collect every car whose name contains 'Mazda'.
# The names are taken from `mtcars.index` — the same frame the rows are then
# read from with `.loc` — rather than from `df3`, an unrelated filtered subset.
mazda = [c for c in mtcars.index if 'Mazda' in c]
mazda_cars = mtcars.loc[mazda]
mazda_cars

In [None]:
toyota_mazda_cars= pd.concat([toyota_cars,mazda_cars])


In [None]:
toyota_mazda_cars2 = toyota_mazda_cars.reset_index()
toyota_merc_cars2 = toyota_merc_cars.reset_index()
merged =toyota_merc_cars2.merge(toyota_mazda_cars2,how='outer')
merged

## GroupBy

In [None]:
# Mean of the numeric columns for each gender.
# `numeric_only=True` skips text columns such as 'Name', which would otherwise
# raise TypeError in pandas 2.x.
df.groupby(['Gender']).mean(numeric_only=True)

In [None]:
df.groupby(['Gender']).describe()

In [None]:
# Mean of the numeric columns for each (Gender, Food) combination.
# Fixes the unbalanced `[[` of the original (a SyntaxError) and restricts the
# mean to numeric columns so text columns do not raise TypeError.
df.groupby(['Gender','Food']).mean(numeric_only=True)

### Exercise

In [None]:
cyl = mtcars.groupby('cyl').mean()

In [None]:
am = mtcars.groupby('am').mean()

In [None]:
cyl_am = mtcars.groupby(['cyl','am']).mean()

In [None]:
# Mean mpg and hp per (cyl, am) group; the columns are picked before aggregating.
mtcars.groupby(['cyl','am'])[['mpg','hp']].mean()

## Aggregation

In [None]:
# Range (max - min) of every numeric column per gender.
# The aggregation is limited to numeric columns because subtracting the text
# values of columns such as 'Name' raises TypeError.
numeric_cols = df.select_dtypes('number').columns.tolist()
df.groupby('Gender')[numeric_cols].agg(lambda x:max(x)-min(x))

In [None]:
# Mean and count of every numeric column per (Gender, Food) group.
# The aggregation is limited to numeric columns because 'mean' cannot be
# computed for text columns such as 'Name'.
numeric_cols = df.select_dtypes('number').columns.tolist()
df.groupby(['Gender', 'Food'])[numeric_cols].agg(['mean', 'count'])

### Exercise

In [None]:
mtcars.groupby(['cyl', 'am']).agg(['mean', 'count'])

In [None]:
cyl_am = mtcars.groupby(['cyl','am']).agg(lambda x:max(x)-min(x))

# Module 4 Data Visualization with Matplotlib

In [None]:
%matplotlib inline

In [None]:
# 200 evenly spaced sample points on [0, 4*pi] with their sine and cosine values.
x = np.linspace(0, 4 * np.pi, num=200)
y, y2 = np.sin(x), np.cos(x)

In [None]:
# Plot both curves on one set of axes. Each line gets a label and the figure a
# legend, because two curves are drawn and the old title named only the sine.
plt.plot(x,y,color='#eeeeee',label='sin(x)')
plt.plot(x,y2,color='blue',label='cos(x)')
plt.xlabel('x')
plt.ylabel('y')
plt.title('sine and cosine curves')
plt.legend()
plt.show()

In [None]:
import seaborn as sb
from matplotlib import rcParams

rcParams['figure.figsize']=10,4
sb.set_style('darkgrid')

In [None]:
# Same figure again, now rendered with the figure size and seaborn style set in
# the previous cell. Labels and a legend identify the two curves.
plt.plot(x,y,color='#eeeeee',label='sin(x)')
plt.plot(x,y2,color='blue',label='cos(x)')
plt.xlabel('x')
plt.ylabel('y')
plt.title('sine and cosine curves')
plt.legend()
plt.show()

In [None]:
# Scatter Plot

x = np.linspace(1,10,10)
y = x + np.random.randn(len(x))

plt.scatter(x,y)
plt.show()

In [None]:
# Boxplot
y1 = np.random.randn(100)*2+10
y2 = np.random.randn(100)*1+8
y3 = np.random.randn(100)*2.5+20

plt.boxplot([y1,y2,y3])
plt.show()

In [None]:
# Pandas Plot
mtcars[['mpg','hp']].plot()

In [None]:
cyl = mtcars[['mpg','hp','cyl']].groupby('cyl').mean()
plt.plot(cyl)

# Module 5 Overview of Machine Learning with Scikit Learn

## Supervised Learning

### Classification

In [None]:
# Step 1 Load Data
from sklearn import datasets
iris = datasets.load_iris()

In [None]:
X = iris.data
y = iris.target

In [None]:
X

In [None]:
len(X)

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing. A fixed `random_state` makes the
# split — and therefore the score computed later — reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
y_train

In [None]:
# Step 2: Define the Model

from sklearn import neighbors

clf = neighbors.KNeighborsClassifier(n_neighbors=3)

In [None]:
# Step 3 Training

clf.fit(X_train,y_train)

In [None]:
# Step 4 Prediction
clf.predict([[5,4,3,4]])

In [None]:
clf.score(X_test,y_test)

In [None]:
clf.predict(X_test)

In [None]:
y_test

### Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt 

x = np.linspace(0,10,100).reshape(-1,1)
y = x + np.random.normal(0,1,len(x)).reshape(-1,1)
plt.scatter(x,y)
plt.show()

In [None]:
from sklearn import linear_model

lm = linear_model.LinearRegression()

In [None]:
lm.fit(x,y)

In [None]:
plt.scatter(x,y)
plt.plot(x,lm.predict(x),'r')
plt.show()

## Unsupervised Learning

### Clustering

In [None]:
# Petal length vs. petal width, coloured by the true species.
# - Colours come from `iris.target`: the name `y` was reassigned by the
#   regression example and no longer has one entry per row of `X`.
# - Columns 2 and 3 are used so the data matches the axis labels (iris feature
#   order: sepal length, sepal width, petal length, petal width).
plt.scatter(X[:,2],X[:,3],c=iris.target)
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

plt.show()

In [None]:
# Step 2 Define the Model

from sklearn import cluster
# Three clusters, one per iris species. `random_state` fixes the centroid
# initialisation so the cluster labels are reproducible; `n_init` is set
# explicitly because its default differs between scikit-learn versions.
clf = cluster.KMeans(n_clusters=3, n_init=10, random_state=0)


In [None]:
# Step 3: Train the Model
clf.fit(X)

In [None]:

# Left panel: true species. Right panel: clusters found by K-Means.
# - The left panel is coloured by `iris.target`: the name `y` was reassigned by
#   the regression example and no longer has one entry per row of `X`.
# - Columns 2 and 3 are used so the data matches the axis labels (iris feature
#   order: sepal length, sepal width, petal length, petal width).
plt.subplot(1,2,1)
plt.scatter(X[:,2],X[:,3],c=iris.target)
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

plt.subplot(1,2,2)

plt.scatter(X[:,2],X[:,3],c=clf.labels_)
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

plt.show()