## Data Analysis Using Python: Tutorial

### Pandas

Pandas is a Python library that provides high-level data structures and manipulation tools designed to make data analysis fast and easy. It is built on top of NumPy, which makes it easy to use in NumPy-centric applications.


In [None]:
# 1. Creating DataFrames
# 2. Subsetting Observations (Rows)
# 3. Subsetting Variables (Columns)
# 4. Reshape Data
# 5. Summarize Data
# 6. Make New Variables
# 7. Handling Missing Values
# 8. Combine Data Sets
# 9. Grouping
# 10. Basic Plotting

In [None]:
# Data Science Pipeline
# Machine Learning Pipeline

In [None]:
# business requirement --> data collection --> data exploration (numpy/pandas; visual) --> ML

In [None]:
# General steps involved
# 1. Reading the data into the work environment (plain Python / NumPy / Pandas)
# 2. Generic data exploration (basic ETL / Data Analyst / BI view)
# 3. Data exploration (DS view)

In [None]:
#iris.head()
#iris.pop('Id') # df = df.drop('Id', axis=1)
#iris.shape
#iris.columns
#iris.index
#iris.index.values
#iris.describe()
#iris.sort_values()
# iris = iris.rename(columns = {'xx':'yy'})

In [None]:
# can access/transform/modify/create
# -- indexes
# -- rows
# -- row ids
# -- columns
# -- column ids
# -- cells
# -- create new columns
# -- delete rows
# -- delete columns

In [None]:
# df.sort_values
# df.rename
# df.sort_index
# df.reset_index
# df.drop
# df['col']
# df[['col1','col2']]
# df.sample(frac=)
# df.sample(n)
# df.drop_duplicates()
# df['col'] = df['col'].replace('a','b')

In [None]:
# Advanced Selection
#iris[iris['Species'] == 'Iris-setosa']
#iris[iris['Species'] != 'Iris-setosa'].shape
#iris[(iris['Species'] == 'Iris-setosa') | (iris['Species'] == 'Iris-virginica')].shape
#iris[(iris['Species'] == 'Iris-setosa') & (iris['SepalLengthCm'] < 5.0)].shape
#iris['Species'].isin(['Iris-setosa','ABC'])
#iris[iris['Species'].isin(['Iris-setosa','ABC'])].shape
#iris['Species'].str.startswith('Iris')
#iris['Species'].str.endswith('Iris').shape
#iris['Species'].str.contains('setosa|virginica')
#iris[iris['Species'].str.contains('setosa|virginica')]
#iris['Species'].str.extract('([A-Za-z]+)-([A-Za-z]+)', expand = False)
#iris.filter(regex = '^Sepal')
#iris.filter(regex = 'Cm$')

In [None]:
# Loading Python Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Build a small demo frame from a dict: each key becomes a column name and
# each list supplies that column's values.
demo_columns = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': ['one', 'two', 'three'],
}
df = pd.DataFrame(demo_columns)
df

Unnamed: 0,A,B,C
0,1,4,one
1,2,5,two
2,3,6,three


In [None]:
# Data Reading in Pandas
#
# Reading / Writing:
#
#   pd.read_*(filename)   -> load a file into a DataFrame
#   df.to_*(filename)     -> write a DataFrame out to a file
#
# These notes are kept as comments so the cell stays valid Python.
#
# Format   Reader                        Writer
# CSV      pd.read_csv                   df.to_csv
# TSV      pd.read_csv(..., sep='\t')    df.to_csv(..., sep='\t')   (pandas has no read_tsv)
# JSON     pd.read_json                  df.to_json
# HTML     pd.read_html                  df.to_html
# Excel    pd.read_excel                 df.to_excel
# HDF5     pd.read_hdf                   df.to_hdf
# Parquet  pd.read_parquet               df.to_parquet
# SAS      pd.read_sas                   (no writer)
# Pickle   pd.read_pickle                df.to_pickle
# SQL      pd.read_sql                   df.to_sql

SyntaxError: invalid syntax (<ipython-input-3-41e580580830>, line 8)

In [None]:
# Load the iris dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, environment-specific path; adjust it to
# wherever iris.csv lives before running.
iris_csv_path = '/content/iris.csv'
iris = pd.read_csv(iris_csv_path)
# Optional: blank out roughly 10% of the cells at random to get a copy with
# missing values to practise on.
#iris_2 = iris.mask(np.random.random(iris.shape) < .1)

In [None]:
# Preview the first five rows to sanity-check the load.
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [None]:
# (number of rows, number of columns)
iris.shape

(150, 5)

In [None]:
# Column labels as a plain Python list.
iris.columns.to_list()

['sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'variety']

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
iris.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
# Summarize Data
# NOTE: a notebook cell renders only its final expression. The earlier lines
# are evaluated but not shown; move one to the end of the cell (or wrap it in
# display()) to see its result.
#iris.describe()
# Column names in this dataset are lowercase ('sepal.length', 'sepal.width'),
# so the pattern must be '^sepal'; '^Sepal' selects no columns at all.
iris.filter(regex='^sepal').sum()
iris['sepal.length'].sum()
iris['sepal.length'].count()
iris['sepal.length'].mean()
iris['sepal.length'].median()  # also available: min(), max(), var(), std()
# Displayed result: per-column totals for the two sepal measurements.
iris[['sepal.length', 'sepal.width']].sum()

sepal.length    876.5
sepal.width     458.6
dtype: float64

In [None]:
# Collect the column totals for the two sepal measurements and present them
# as a one-column DataFrame labelled 'SL' / 'SW'.
# The accumulator is deliberately NOT called `sum`: binding that name at the
# top level of a notebook shadows the builtin sum() for every later cell.
sepal_cols = ['sepal.length', 'sepal.width']
col_sums = []
for col in sepal_cols:
    total = iris[col].sum()
    print(total)
    col_sums.append(total)
print(col_sums)
# Despite its name, sum_dict is a DataFrame; the name is kept so that any
# later cell referring to it keeps working.
sum_dict = pd.DataFrame({'Sum': col_sums}, index=['SL', 'SW'])
sum_dict

876.5
458.6
[876.5, 458.6]


Unnamed: 0,Sum
SL,876.5
SW,458.6


In [None]:
# Data type of each column.
iris.dtypes

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

In [None]:
# Number of missing values per column (isnull() flags them, sum() counts the flags).
iris.isnull().sum()

sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
variety         0
dtype: int64

In [None]:
# Give the label column the name used throughout the rest of the notebook,
# then list the distinct labels it contains.
column_renames = {'variety': 'Species'}
iris = iris.rename(columns=column_renames)
iris['Species'].unique()

array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

In [None]:
# Number of rows for each distinct Species label.
iris['Species'].value_counts()

Species
Setosa        50
Versicolor    50
Virginica     50
Name: count, dtype: int64

In [None]:
# Keep only the rows whose Species label ends with 'Setosa'.
# Other string predicates that work the same way: .str.startswith(...) and
# .str.contains('Setosa|Virginica') (regex alternation).
ends_with_setosa = iris['Species'].str.endswith('Setosa')
iris_setosa = iris.loc[ends_with_setosa]
iris_setosa.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [None]:
def _setosa_to_zero(label):
    """Return 0 for the 'Setosa' label and every other label unchanged."""
    if label == 'Setosa':
        return 0
    return label


# Element-wise recode of Species into a new column; non-Setosa rows keep their
# original string label, so the column holds a mix of ints and strings.
iris['new'] = iris['Species'].map(_setosa_to_zero)
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new
0,5.1,3.5,1.4,0.2,Setosa,0
1,4.9,3.0,1.4,0.2,Setosa,0
2,4.7,3.2,1.3,0.2,Setosa,0
3,4.6,3.1,1.5,0.2,Setosa,0
4,5.0,3.6,1.4,0.2,Setosa,0


In [None]:
# Rows at positions 45-54 (end position excluded): a window where the Species
# label changes, so both outcomes of the recode in `new` are visible.
iris.iloc[45:55]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new
45,4.8,3.0,1.4,0.3,Setosa,0
46,5.1,3.8,1.6,0.2,Setosa,0
47,4.6,3.2,1.4,0.2,Setosa,0
48,5.3,3.7,1.5,0.2,Setosa,0
49,5.0,3.3,1.4,0.2,Setosa,0
50,7.0,3.2,4.7,1.4,Versicolor,Versicolor
51,6.4,3.2,4.5,1.5,Versicolor,Versicolor
52,6.9,3.1,4.9,1.5,Versicolor,Versicolor
53,5.5,2.3,4.0,1.3,Versicolor,Versicolor
54,6.5,2.8,4.6,1.5,Versicolor,Versicolor


In [None]:
# Rows at positions 95-104 (end position excluded): a window where the Species
# label changes again.
iris.iloc[95:105]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new
95,5.7,3.0,4.2,1.2,Versicolor,Versicolor
96,5.7,2.9,4.2,1.3,Versicolor,Versicolor
97,6.2,2.9,4.3,1.3,Versicolor,Versicolor
98,5.1,2.5,3.0,1.1,Versicolor,Versicolor
99,5.7,2.8,4.1,1.3,Versicolor,Versicolor
100,6.3,3.3,6.0,2.5,Virginica,Virginica
101,5.8,2.7,5.1,1.9,Virginica,Virginica
102,7.1,3.0,5.9,2.1,Virginica,Virginica
103,6.3,2.9,5.6,1.8,Virginica,Virginica
104,6.5,3.0,5.8,2.2,Virginica,Virginica


In [None]:
def _virginica_to_two(value):
    """Return 2 for the 'Virginica' label and every other value unchanged."""
    return 2 if value == 'Virginica' else value


# Second recode step, applied on top of `new`: values other than 'Virginica'
# pass through untouched.
iris['new2'] = iris['new'].map(_virginica_to_two)
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new,new2
0,5.1,3.5,1.4,0.2,Setosa,0,0
1,4.9,3.0,1.4,0.2,Setosa,0,0
2,4.7,3.2,1.3,0.2,Setosa,0,0
3,4.6,3.1,1.5,0.2,Setosa,0,0
4,5.0,3.6,1.4,0.2,Setosa,0,0


In [None]:
# Last five rows of the frame.
iris.tail()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new,new2
145,6.7,3.0,5.2,2.3,Virginica,Virginica,2
146,6.3,2.5,5.0,1.9,Virginica,Virginica,2
147,6.5,3.0,5.2,2.0,Virginica,Virginica,2
148,6.2,3.4,5.4,2.3,Virginica,Virginica,2
149,5.9,3.0,5.1,1.8,Virginica,Virginica,2


In [None]:
# Overwrite `new` with sepal.length shifted up by one (vectorised addition
# over the whole column).
iris['new'] = iris['sepal.length'].add(1)
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new,new2
0,5.1,3.5,1.4,0.2,Setosa,6.1,0
1,4.9,3.0,1.4,0.2,Setosa,5.9,0
2,4.7,3.2,1.3,0.2,Setosa,5.7,0
3,4.6,3.1,1.5,0.2,Setosa,5.6,0
4,5.0,3.6,1.4,0.2,Setosa,6.0,0


In [None]:
# String concatenation, not arithmetic: each value is rendered as text and the
# character '1' is appended (5.1 -> '5.11').
iris['new3'] = iris['sepal.length'].map(lambda value: f"{value}1")
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new,new2,new3
0,5.1,3.5,1.4,0.2,Setosa,6.1,0,5.11
1,4.9,3.0,1.4,0.2,Setosa,5.9,0,4.91
2,4.7,3.2,1.3,0.2,Setosa,5.7,0,4.71
3,4.6,3.1,1.5,0.2,Setosa,5.6,0,4.61
4,5.0,3.6,1.4,0.2,Setosa,6.0,0,5.01


In [None]:
# Binary encoding of the label: 1 for 'Setosa', 2 for everything else.
is_setosa = iris['Species'].eq('Setosa')
iris['species_2'] = is_setosa.map({True: 1, False: 2})
iris.head(100)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new,new2,new3,species_2
0,5.1,3.5,1.4,0.2,Setosa,6.1,0,5.11,1
1,4.9,3.0,1.4,0.2,Setosa,5.9,0,4.91,1
2,4.7,3.2,1.3,0.2,Setosa,5.7,0,4.71,1
3,4.6,3.1,1.5,0.2,Setosa,5.6,0,4.61,1
4,5.0,3.6,1.4,0.2,Setosa,6.0,0,5.01,1
...,...,...,...,...,...,...,...,...,...
95,5.7,3.0,4.2,1.2,Versicolor,6.7,Versicolor,5.71,2
96,5.7,2.9,4.2,1.3,Versicolor,6.7,Versicolor,5.71,2
97,6.2,2.9,4.3,1.3,Versicolor,7.2,Versicolor,6.21,2
98,5.1,2.5,3.0,1.1,Versicolor,6.1,Versicolor,5.11,2


In [None]:
# Label-based slice: .loc includes both endpoints, so this returns the rows labelled 0, 1 and 2.
# The trailing comment lists positional (.iloc) spellings that select the same rows here.
iris.loc[0:2] #iris.iloc[:3,]; iris.iloc[0:3,0:];

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new,new2,new3,species_2
0,5.1,3.5,1.4,0.2,Setosa,6.1,0,5.11,1
1,4.9,3.0,1.4,0.2,Setosa,5.9,0,4.91,1
2,4.7,3.2,1.3,0.2,Setosa,5.7,0,4.71,1


In [None]:
# Positional column selection: all rows, columns at positions 1 through 5
# (the end position 6 is excluded).
col_window = slice(1, 6)
iris_samp = iris.iloc[:, col_window]
iris_samp

Unnamed: 0,sepal.width,petal.length,petal.width,Species,new
0,3.5,1.4,0.2,Setosa,6.1
1,3.0,1.4,0.2,Setosa,5.9
2,3.2,1.3,0.2,Setosa,5.7
3,3.1,1.5,0.2,Setosa,5.6
4,3.6,1.4,0.2,Setosa,6.0
...,...,...,...,...,...
145,3.0,5.2,2.3,Virginica,7.7
146,2.5,5.0,1.9,Virginica,7.3
147,3.0,5.2,2.0,Virginica,7.5
148,3.4,5.4,2.3,Virginica,7.2


In [None]:
# Group the rows by Species. The result is a DataFrameGroupBy object; nothing
# is aggregated until a method such as .size() or .mean() is called on it.
# NOTE(review): despite the name, this groups `iris`, not `iris_samp`.
iris_samp_grp = iris.groupby(by='Species')
iris_samp_grp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7827b926ae90>

In [None]:
# Mean petal.length within each Species group.
iris.groupby('Species')['petal.length'].mean()

Species
Setosa        1.462
Versicolor    4.260
Virginica     5.552
Name: petal.length, dtype: float64

In [None]:
# Check which columns the frame currently holds.
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species,new,new2,new3,species_2
0,5.1,3.5,1.4,0.2,Setosa,6.1,0,5.11,1
1,4.9,3.0,1.4,0.2,Setosa,5.9,0,4.91,1
2,4.7,3.2,1.3,0.2,Setosa,5.7,0,4.71,1
3,4.6,3.1,1.5,0.2,Setosa,5.6,0,4.61,1
4,5.0,3.6,1.4,0.2,Setosa,6.0,0,5.01,1


In [None]:
# Remove the scratch columns created in the earlier cells.
# errors='ignore' makes the cell safe to re-run: without it a second run
# raises KeyError because the columns are already gone.
# NOTE(review): the drop is kept in place on purpose. `iris_samp_grp` was built
# from this same frame object earlier and appears to rely on seeing the
# columns removed; confirm before switching to `iris = iris.drop(...)`.
helper_cols = ['species_2', 'new', 'new2', 'new3']
iris.drop(columns=helper_cols, inplace=True, errors='ignore')
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [None]:
# Iterating a groupby yields (group label, sub-frame) pairs; print each label
# followed by its rows.
for name, group in iris.groupby(by='Species'):
    print(name, group, sep='\n')

Setosa
    sepal.length  sepal.width  petal.length  petal.width Species
0            5.1          3.5           1.4          0.2  Setosa
1            4.9          3.0           1.4          0.2  Setosa
2            4.7          3.2           1.3          0.2  Setosa
3            4.6          3.1           1.5          0.2  Setosa
4            5.0          3.6           1.4          0.2  Setosa
5            5.4          3.9           1.7          0.4  Setosa
6            4.6          3.4           1.4          0.3  Setosa
7            5.0          3.4           1.5          0.2  Setosa
8            4.4          2.9           1.4          0.2  Setosa
9            4.9          3.1           1.5          0.1  Setosa
10           5.4          3.7           1.5          0.2  Setosa
11           4.8          3.4           1.6          0.2  Setosa
12           4.8          3.0           1.4          0.1  Setosa
13           4.3          3.0           1.1          0.1  Setosa
14           5.8  

In [None]:
# Number of rows in each Species group.
iris_samp_grp.size()

Species
Setosa        50
Versicolor    50
Virginica     50
dtype: int64

In [None]:
# Group-wise mean of every numeric column.
# numeric_only=True keeps this working even if the grouped frame still carries
# text columns; with the default, recent pandas versions raise TypeError when
# asked to average an object column.
iris_samp_grp.mean(numeric_only=True)

Unnamed: 0_level_0,sepal.length,sepal.width,petal.length,petal.width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Setosa,5.006,3.428,1.462,0.246
Versicolor,5.936,2.77,4.26,1.326
Virginica,6.588,2.974,5.552,2.026


In [None]:
# Select one column from the existing groupby, then take its mean per Species.
iris_samp_grp['sepal.length'].mean()

Species
Setosa        5.006
Versicolor    5.936
Virginica     6.588
Name: sepal.length, dtype: float64

In [None]:
# Other aggregations to try (left commented out), using this dataset's column names:
#iris.groupby('Species')['sepal.length'].sum().sort_values(ascending=False)
#iris.groupby('Species').agg({'sepal.length': 'sum', 'sepal.width': 'mean'})
# The split examples below assume hyphenated labels such as 'Iris-setosa'; the
# labels loaded in this notebook ('Setosa', ...) contain no hyphen.
#iris['Species'].str.split('-', expand=True).head()
#iris['Species'].str.split('-', expand=True).stack().head()
# Per-species column totals.
iris.groupby(by='Species').sum()

Unnamed: 0_level_0,sepal.length,sepal.width,petal.length,petal.width
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Setosa,250.3,171.4,73.1,12.3
Versicolor,296.8,138.5,213.0,66.3
Virginica,329.4,148.7,277.6,101.3


In [None]:
# Concatenation & Joins

In [None]:
# Two single-species subsets, used as inputs for the concatenation example.
iris_samp_1 = iris.loc[iris['Species'].eq('Setosa')]
iris_samp_2 = iris.loc[iris['Species'].eq('Virginica')]

In [None]:
# Display the Setosa subset (use .head() for a shorter preview).
iris_samp_1

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,4.6,3.4,1.4,0.3,Setosa
7,5.0,3.4,1.5,0.2,Setosa
8,4.4,2.9,1.4,0.2,Setosa
9,4.9,3.1,1.5,0.1,Setosa


In [None]:
# Display the Virginica subset; note that it keeps its original row labels.
iris_samp_2

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species
100,6.3,3.3,6.0,2.5,Virginica
101,5.8,2.7,5.1,1.9,Virginica
102,7.1,3.0,5.9,2.1,Virginica
103,6.3,2.9,5.6,1.8,Virginica
104,6.5,3.0,5.8,2.2,Virginica
105,7.6,3.0,6.6,2.1,Virginica
106,4.9,2.5,4.5,1.7,Virginica
107,7.3,2.9,6.3,1.8,Virginica
108,6.7,2.5,5.8,1.8,Virginica
109,7.2,3.6,6.1,2.5,Virginica


In [None]:
# Stack the two subsets on top of each other (row-wise). Each row keeps the
# index label it had in its source frame.
species_subsets = [iris_samp_1, iris_samp_2]
iris_samp_concat = pd.concat(species_subsets, axis='index')
iris_samp_concat

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,Species
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
