## Import Library

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

### Series and DataFrame

In [2]:
# IPython-only help lookup: displays the pd.Series docstring (not valid in plain Python)
pd.Series?

In [3]:
# IPython-only help lookup: displays the pd.DataFrame docstring (not valid in plain Python)
pd.DataFrame?

In [4]:
# Plain Python list of animal names, used as the raw data for a Series
my_pets = [
    'Lion',
    'Cat',
    'Birds',
    'Fish',
]
my_pets

['Lion', 'Cat', 'Birds', 'Fish']

In [6]:
# Wrap the list in a Series; with no index given, pandas labels the values 0, 1, 2, ...
pd.Series(data=my_pets)

0     Lion
1      Cat
2    Birds
3     Fish
dtype: object

### We can create our own index, overriding the default index (0, 1, 2, ...)

In [11]:
# Custom index labels: the five weekdays
my_index = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
]

In [12]:
# Course names, one per weekday label
my_courses = [
    'Economics',
    'Geography',
    'Finance',
    'Mathematics',
    'History',
]

In [13]:
# Series built from the course list alone, so it gets the default 0..4 index
pd.Series(data=my_courses)

0      Economics
1      Geography
2        Finance
3    Mathematics
4        History
dtype: object

In [14]:
# Same data, but labelled with the weekday names instead of 0..4
pd.Series(data=my_courses, index=my_index)

Monday         Economics
Tuesday        Geography
Wednesday        Finance
Thursday     Mathematics
Friday           History
dtype: object

In [16]:
# Weekday abbreviations labelled Day1..Day5 rather than the default 0..4
days = pd.Series(
    data=['Mon', 'Tues', 'Wed', 'Thurs', 'Fri'],
    index=['Day1', 'Day2', 'Day3', 'Day4', 'Day5'],
)
days

Day1      Mon
Day2     Tues
Day3      Wed
Day4    Thurs
Day5      Fri
dtype: object

In [17]:
# Index labels produced by splitting a space-separated string on whitespace
courses = pd.Series(
    data=['Eco', 'Geo', 'Fin', 'Maths', 'Hist'],
    index='Day1 Day2 Day3 Day4 Day5'.split(),
)
courses

Day1      Eco
Day2      Geo
Day3      Fin
Day4    Maths
Day5     Hist
dtype: object

In [18]:
# Same Series, with the index labels split out of a comma-and-space separated string
courses = pd.Series(
    data=['Eco', 'Geo', 'Fin', 'Maths', 'Hist'],
    index='Day1, Day2, Day3, Day4, Day5'.split(', '),
)
courses

Day1      Eco
Day2      Geo
Day3      Fin
Day4    Maths
Day5     Hist
dtype: object

### Series concatenation based on index

In [21]:
# Element-wise string concatenation: values are paired by matching index label (Day1..Day5)
days + ' ' + courses

Day1        Mon Eco
Day2       Tues Geo
Day3        Wed Fin
Day4    Thurs Maths
Day5       Fri Hist
dtype: object

### Accessing Series Data using index

In [24]:
# Look up a single value by its index label
courses['Day5']

'Hist'

In [26]:
# Look up a single value by its index label
days['Day3']

'Wed'

### loc & iloc

In [27]:
# Build a Series from a dict: the keys become the index labels, the values the data.
# NOTE(review): 'Atheltes' looks like a typo for 'Athletes'; left unchanged here.
sports = {
    'Football': 'Spain',
    'NBA': 'USA',
    'Cricket': 'India',
    'Atheltes': 'Jamaica',
}
sports_series = pd.Series(data=sports)
sports_series

Football      Spain
NBA             USA
Cricket       India
Atheltes    Jamaica
dtype: object

In [28]:
# .loc selects by index label
sports_series.loc['Cricket']

'India'

In [29]:
# .iloc selects by integer position; position 2 is the third entry ('Cricket')
sports_series.iloc[2]

'India'

In [30]:
# Plain [] with a label returns the same value as .loc here
sports_series['Cricket']

'India'

In [31]:
# Integer data with integer index labels (10, 20, 30) that differ from the positions (0, 1, 2)
s2 = pd.Series(
    data=[100, 200, 300],
    index=[10, 20, 30],
)
s2

10    100
20    200
30    300
dtype: int64

In [32]:
# With an integer index, [] looks up the label 10 (not position 10), giving 100
s2[10]

np.int64(100)

In [34]:
# .loc is explicitly label-based: label 10 maps to the value 100
s2.loc[10]

np.int64(100)

## DataFrame

In [39]:
# 10x5 array of samples from the standard normal distribution.
# A generator with a fixed (arbitrary) seed replaces the unseeded np.random.randn
# call so that Restart & Run All reproduces the same array, and the min/max
# cells below always describe the array displayed here.
random_normal_distribution_array = np.random.default_rng(42).standard_normal((10, 5))
random_normal_distribution_array

array([[ 2.40854529,  0.7996967 , -0.74902621,  0.51813727,  1.46835487],
       [-0.35925925,  1.15094803, -1.30450681,  0.74712775,  1.48693624],
       [-0.16052604, -1.35923219,  0.58424106, -0.19892893, -0.45715536],
       [ 1.0932033 ,  0.66827395, -0.46620326,  0.60866284,  0.55931795],
       [-0.38889314,  0.20587444, -0.52483238, -0.4384877 ,  0.416256  ],
       [-2.1516762 , -0.06875414, -1.13472042, -0.53207631, -1.86565347],
       [-0.11076466, -0.19397704, -0.17091104, -0.4307133 , -1.09577742],
       [ 0.40522203, -0.64698753, -1.50337432, -0.48531232,  1.42004319],
       [ 0.84064936, -0.06458189, -0.02369967, -1.16054233,  0.76716156],
       [ 0.2863776 ,  1.75550109,  0.42733618,  1.93558518, -0.14988151]])

In [37]:
# Smallest value across every element of the array
random_normal_distribution_array.min()

np.float64(-2.5202229973575747)

In [40]:
# Largest value across every element of the array
random_normal_distribution_array.max()

np.float64(2.4085452910062757)

In [41]:
# Build a 10x5 DataFrame of standard-normal samples with labelled rows and columns.
# - A generator with a fixed (arbitrary) seed replaces the unseeded np.random.randn
#   call so the frame is identical on every Restart & Run All.
# - The row1..row10 / column1..column5 labels are generated rather than hand-typed
#   in a comma-separated string, which avoids miscounting or mistyping a label.
my_dataframe = pd.DataFrame(
    np.random.default_rng(7).standard_normal((10, 5)),
    index=[f'row{i}' for i in range(1, 11)],
    columns=[f'column{i}' for i in range(1, 6)],
)
my_dataframe

Unnamed: 0,column1,column2,column3,column4,column5
row1,-0.459777,-0.837914,0.124778,-0.948637,0.741644
row2,0.849825,-0.163339,-0.283322,0.935878,0.35992
row3,0.059858,1.064744,1.24122,0.4007,1.925026
row4,0.685403,0.226283,-1.841958,-0.278643,1.741308
row5,-1.563659,0.661184,-0.824104,-0.494983,0.210169
row6,1.324894,-0.570205,1.238114,1.47731,0.136338
row7,-0.589005,-2.612278,-0.008682,-0.361822,-0.013318
row8,-0.924938,0.150113,0.742846,-0.172087,1.437708
row9,-1.325651,0.454818,-0.361992,-1.058341,-0.953292
row10,-2.229473,0.100977,1.1473,2.491926,0.416009


In [42]:
# Confirm the object built above is a pandas DataFrame
type(my_dataframe)

pandas.core.frame.DataFrame

In [43]:
# A single column pulled out of a DataFrame is a Series
type(my_dataframe['column1'])

pandas.core.series.Series

In [46]:
# Select a single column by name; the result is a Series indexed by the row labels
my_dataframe['column1']

row1    -0.459777
row2     0.849825
row3     0.059858
row4     0.685403
row5    -1.563659
row6     1.324894
row7    -0.589005
row8    -0.924938
row9    -1.325651
row10   -2.229473
Name: column1, dtype: float64

### Selection and Indexing

In [49]:
# Select several columns at once by passing a list of column names
my_dataframe[['column1', 'column3', 'column5']]

Unnamed: 0,column1,column3,column5
row1,-0.459777,0.124778,0.741644
row2,0.849825,-0.283322,0.35992
row3,0.059858,1.24122,1.925026
row4,0.685403,-1.841958,1.741308
row5,-1.563659,-0.824104,0.210169
row6,1.324894,1.238114,0.136338
row7,-0.589005,-0.008682,-0.013318
row8,-0.924938,0.742846,1.437708
row9,-1.325651,-0.361992,-0.953292
row10,-2.229473,1.1473,0.416009


### Let's add a new column to our DataFrame

In [69]:
# Derive a new column, column6, holding twice each column1 value
my_dataframe['column6'] = my_dataframe['column1'].mul(2)
my_dataframe

Unnamed: 0,column1,column2,column3,column4,column5,column6
row1,-0.459777,-0.837914,0.124778,-0.948637,0.741644,-0.919553
row2,0.849825,-0.163339,-0.283322,0.935878,0.35992,1.699651
row3,0.059858,1.064744,1.24122,0.4007,1.925026,0.119715
row4,0.685403,0.226283,-1.841958,-0.278643,1.741308,1.370806
row5,-1.563659,0.661184,-0.824104,-0.494983,0.210169,-3.127318
row6,1.324894,-0.570205,1.238114,1.47731,0.136338,2.649789
row7,-0.589005,-2.612278,-0.008682,-0.361822,-0.013318,-1.17801
row8,-0.924938,0.150113,0.742846,-0.172087,1.437708,-1.849877
row9,-1.325651,0.454818,-0.361992,-1.058341,-0.953292,-2.651302
row10,-2.229473,0.100977,1.1473,2.491926,0.416009,-4.458946


### Drop a column from our DataFrame

In [73]:
my_dataframe.drop('column4', axis=1) # axis=1 refers to columns; this returns a new DataFrame and leaves my_dataframe unchanged

Unnamed: 0,column1,column2,column3,column5,column6
row1,-0.459777,-0.837914,0.124778,0.741644,-0.919553
row2,0.849825,-0.163339,-0.283322,0.35992,1.699651
row3,0.059858,1.064744,1.24122,1.925026,0.119715
row4,0.685403,0.226283,-1.841958,1.741308,1.370806
row5,-1.563659,0.661184,-0.824104,0.210169,-3.127318
row6,1.324894,-0.570205,1.238114,0.136338,2.649789
row7,-0.589005,-2.612278,-0.008682,-0.013318,-1.17801
row8,-0.924938,0.150113,0.742846,1.437708,-1.849877
row9,-1.325651,0.454818,-0.361992,-0.953292,-2.651302
row10,-2.229473,0.100977,1.1473,0.416009,-4.458946


In [75]:
my_dataframe # the drop above returned a new frame, so column4 is still present in the original

Unnamed: 0,column1,column2,column3,column4,column5,column6
row1,-0.459777,-0.837914,0.124778,-0.948637,0.741644,-0.919553
row2,0.849825,-0.163339,-0.283322,0.935878,0.35992,1.699651
row3,0.059858,1.064744,1.24122,0.4007,1.925026,0.119715
row4,0.685403,0.226283,-1.841958,-0.278643,1.741308,1.370806
row5,-1.563659,0.661184,-0.824104,-0.494983,0.210169,-3.127318
row6,1.324894,-0.570205,1.238114,1.47731,0.136338,2.649789
row7,-0.589005,-2.612278,-0.008682,-0.361822,-0.013318,-1.17801
row8,-0.924938,0.150113,0.742846,-0.172087,1.437708,-1.849877
row9,-1.325651,0.454818,-0.361992,-1.058341,-0.953292,-2.651302
row10,-2.229473,0.100977,1.1473,2.491926,0.416009,-4.458946


In [76]:
# Remove column4 from my_dataframe itself: inplace=True mutates the existing frame
# instead of returning a modified copy.
# errors='ignore' keeps the cell re-runnable; without it a second execution raises
# KeyError because column4 has already been removed.
my_dataframe.drop('column4', axis=1, inplace=True, errors='ignore')
my_dataframe

Unnamed: 0,column1,column2,column3,column5,column6
row1,-0.459777,-0.837914,0.124778,0.741644,-0.919553
row2,0.849825,-0.163339,-0.283322,0.35992,1.699651
row3,0.059858,1.064744,1.24122,1.925026,0.119715
row4,0.685403,0.226283,-1.841958,1.741308,1.370806
row5,-1.563659,0.661184,-0.824104,0.210169,-3.127318
row6,1.324894,-0.570205,1.238114,0.136338,2.649789
row7,-0.589005,-2.612278,-0.008682,-0.013318,-1.17801
row8,-0.924938,0.150113,0.742846,1.437708,-1.849877
row9,-1.325651,0.454818,-0.361992,-0.953292,-2.651302
row10,-2.229473,0.100977,1.1473,0.416009,-4.458946


In [77]:
# Drop a row from the DataFrame by its index label (axis=0 refers to rows).
# inplace=True mutates my_dataframe directly; errors='ignore' keeps the cell
# re-runnable, since a second execution would otherwise raise KeyError because
# row2 has already been removed.
my_dataframe.drop('row2', axis=0, inplace=True, errors='ignore')
my_dataframe

Unnamed: 0,column1,column2,column3,column5,column6
row1,-0.459777,-0.837914,0.124778,0.741644,-0.919553
row3,0.059858,1.064744,1.24122,1.925026,0.119715
row4,0.685403,0.226283,-1.841958,1.741308,1.370806
row5,-1.563659,0.661184,-0.824104,0.210169,-3.127318
row6,1.324894,-0.570205,1.238114,0.136338,2.649789
row7,-0.589005,-2.612278,-0.008682,-0.013318,-1.17801
row8,-0.924938,0.150113,0.742846,1.437708,-1.849877
row9,-1.325651,0.454818,-0.361992,-0.953292,-2.651302
row10,-2.229473,0.100977,1.1473,0.416009,-4.458946


In [78]:
# iloc selects by integer position: position 1 is the second row of the frame
# (labelled row3 here, because row2 was dropped earlier); the row comes back as a Series
my_dataframe.iloc[1]

column1    0.059858
column2    1.064744
column3    1.241220
column5    1.925026
column6    0.119715
Name: row3, dtype: float64

In [79]:
# A single row selected with iloc is returned as a Series
type(my_dataframe.iloc[1])

pandas.core.series.Series

In [80]:
# loc selects by label: a (row label, column label) pair returns a single scalar value
my_dataframe.loc['row3', 'column2']

np.float64(1.0647444893464781)

### Reading a Dataset with Pandas

In [65]:
# Locate the raw CSV relative to the current user's home directory instead of a
# hard-coded '/Users/<name>/...' prefix, so the cell is not tied to one account.
# NOTE(review): assumes the DS_Workspace project lives under ~/Documents — adjust
# this path if the project is checked out elsewhere.
raw_file_path = (
    Path.home() / 'Documents' / 'DS_Workspace' / 'ds_bootcamp'
    / 'data' / 'raw' / 'Automobile.csv'
)
data = pd.read_csv(raw_file_path)
# Preview the first 15 rows
data.head(15)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,number_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,168,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,168,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,168,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
5,2,161,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250
6,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
7,1,168,audi,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,18920
8,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
9,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


In [66]:
# Preview the last rows of the dataset (tail() defaults to 5 rows)
data.tail()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,number_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
196,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
197,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
198,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
199,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.4,23.0,106,4800,26,27,22470
200,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,19,25,22625
