# Pandas

- Data representation is better than numpy
- It can store hetrogenous data

In [6]:
import pandas as pd

# We have 2 data structures

- Series - 1D
- Dataframe - 2D

# Series

In [7]:
# Create an series data frame using list


rivers = ['Ganga' , 'Yamuna' , 'Kaveri' , 'Krishna' , 'Godavari']

In [9]:
ser_river = pd.Series(rivers)
ser_river

0       Ganga
1      Yamuna
2      Kaveri
3     Krishna
4    Godavari
dtype: object

In [14]:
ser_river = pd.Series(rivers , index = ['a' , 'b' , 'c' , 'd' , 'e'])
ser_river

a       Ganga
b      Yamuna
c      Kaveri
d     Krishna
e    Godavari
dtype: object

In [16]:
ser_river = pd.Series(rivers , index = range(100 , 105))
ser_river

100       Ganga
101      Yamuna
102      Kaveri
103     Krishna
104    Godavari
dtype: object

In [17]:
import numpy as np

In [19]:
# Using an array

random_ser = pd.Series(np.random.randn(5) , index = ['a' , 'b' , 'c' , 'd' , 'e'] )
random_ser

a   -1.444856
b   -0.394515
c    0.920313
d    0.855907
e    0.755883
dtype: float64

In [20]:
# Using an array

random_ser = pd.Series(np.random.randn(5) , index = ['a' , 'b' , 'c' , 'd'] )
random_ser

ValueError: Length of values (5) does not match length of index (4)

In [21]:
dic = {'a' :1 , 'b':2 , 'c':4 , 'd':5}


ser_data = pd.Series(dic)
ser_data

a    1
b    2
c    4
d    5
dtype: int64

In [23]:
dic = {'a' :1 , 'b':2 , 'c':4 , 'd':5}


ser_data = pd.Series(dic , index = ['a' , 'b'])
ser_data

a    1
b    2
dtype: int64

In [24]:
dic = {'a' :1 , 'b':2 , 'c':4 , 'd':5}


ser_data = pd.Series(dic , index = ['a' , 'b' , 'z'])
ser_data

a    1.0
b    2.0
z    NaN
dtype: float64

# Indexing and slicing

It can be done is 2 ways

- user defined index value 
- By normal index values (0 - len)

In [25]:
ser_river = pd.Series(rivers , index = ['a' , 'b' , 'c' , 'd' , 'e'])
ser_river

a       Ganga
b      Yamuna
c      Kaveri
d     Krishna
e    Godavari
dtype: object

In [26]:
ser_river['c']

'Kaveri'

In [27]:
ser_river[2]

'Kaveri'

# Slicing

In [28]:
ser_river

a       Ganga
b      Yamuna
c      Kaveri
d     Krishna
e    Godavari
dtype: object

In [30]:
ser_river[1:4]

# start value ---> inclusive
# stop value ---> exclusive

b     Yamuna
c     Kaveri
d    Krishna
dtype: object

In [31]:
# User defined

ser_river['b':'e']

# start value ---> inclusive
# stop value ---> inclusive

b      Yamuna
c      Kaveri
d     Krishna
e    Godavari
dtype: object

In [32]:
ser_river = pd.Series(rivers)
ser_river


0       Ganga
1      Yamuna
2      Kaveri
3     Krishna
4    Godavari
dtype: object

In [33]:
ser_river[1:4]

1     Yamuna
2     Kaveri
3    Krishna
dtype: object

In [34]:
brics_country = ['Brazil', 'Russia', 'India', 'China', 'South Africa']

brics_currency = ['Real', 'Ruble', 'Rupee', 'Renminbi', 'Rand' ]

In [36]:
data = pd.Series(brics_country , index = brics_currency)
data

Real              Brazil
Ruble             Russia
Rupee              India
Renminbi           China
Rand        South Africa
dtype: object

In [38]:
data['Ruble':'Renminbi']

Ruble       Russia
Rupee        India
Renminbi     China
dtype: object

In [41]:
data.index = ['a' , 'b' , 'c' , 'd' , 'e']

In [42]:
data

a          Brazil
b          Russia
c           India
d           China
e    South Africa
dtype: object

In [44]:
data[0] = 'Japan'

In [45]:
data

a           Japan
b          Russia
c           India
d           China
e    South Africa
dtype: object

# 2D --> Dataframes

In [None]:
brics_country = ['Brazil', 'Russia', 'India', 'China', 'South Africa']

brics_currency = ['Real', 'Ruble', 'Rupee', 'Renminbi', 'Rand' ]

In [46]:
# Create a data frame from a list

df = pd.DataFrame(brics_country)
df

Unnamed: 0,0
0,Brazil
1,Russia
2,India
3,China
4,South Africa


In [48]:
df = pd.DataFrame(brics_country , columns = ['Country'])
df

Unnamed: 0,Country
0,Brazil
1,Russia
2,India
3,China
4,South Africa


In [None]:
brics_country = ['Brazil', 'Russia', 'India', 'China', 'South Africa']

brics_currency = ['Real', 'Ruble', 'Rupee', 'Renminbi', 'Rand' ]

In [52]:
df = pd.DataFrame([brics_country , brics_currency])
df

Unnamed: 0,0,1,2,3,4
0,Brazil,Russia,India,China,South Africa
1,Real,Ruble,Rupee,Renminbi,Rand


In [54]:
brics_country = ['Brazil', 'Russia', 'India', 'China', 'South Africa']

brics_currency = ['Real', 'Ruble', 'Rupee', 'Renminbi', 'Rand' ]

df = pd.DataFrame({'Col1':brics_country ,
                  'Col2' : brics_currency})
df

Unnamed: 0,Col1,Col2
0,Brazil,Real
1,Russia,Ruble
2,India,Rupee
3,China,Renminbi
4,South Africa,Rand


In [56]:
brics_country = ['Brazil', 'Russia', 'India', 'China', 'South Africa']

brics_currency = ['Real', 'Ruble', 'Rupee', 'Renminbi',np.nan ]

df = pd.DataFrame({'Col1':brics_country ,
                  'Col2' : brics_currency})
df

Unnamed: 0,Col1,Col2
0,Brazil,Real
1,Russia,Ruble
2,India,Rupee
3,China,Renminbi
4,South Africa,


In [62]:
brics_country = ['Brazil', 'Russia', 'India', 'China', 'South Africa']

brics_currency = ['Real', 'Ruble', 'Rupee',np.nan ,'Rand' ]

df = pd.DataFrame({'Col1': pd.Series(brics_country) ,
                  'Col2' : pd.Series(brics_currency)})
df

Unnamed: 0,Col1,Col2
0,Brazil,Real
1,Russia,Ruble
2,India,Rupee
3,China,
4,South Africa,Rand


In [64]:
# Import datafram

#csv

df_csv = pd.read_csv('datasets/ted_data.csv')
df_csv

Unnamed: 0,name_speaker,speaker_occupation,title,views,comments
0,Ken Robinson,Author/educator,Do schools kill creativity?,47227110,4553
1,Al Gore,Climate advocate,Averting the climate crisis,3200520,265
2,David Pogue,Technology columnist,Simplicity sells,1636292,124
3,Majora Carter,Activist for environmental justice,Greening the ghetto,1697550,200
4,Hans Rosling,Global health expert; data visionary,The best stats you've ever seen,12005869,593
5,Tony Robbins,Life coach; expert in leadership psychology,Why we do what we do,20685401,672
6,Julia Sweeney,"Actor, comedian, playwright",Letting go of God,3769987,919
7,Joshua Prince-Ramus,Architect,Behind the design of Seattle's library,967741,46
8,Dan Dennett,"Philosopher, cognitive scientist",Let's teach religion -- all religion -- in sch...,2567958,582
9,Rick Warren,"Pastor, author",A life of purpose,3095993,900


In [13]:

df_excel = pd.read_excel('C://Users//MIT//Desktop/datasets/automobile_camera.xlsx')
df_excel

Unnamed: 0.1,Unnamed: 0,car_name,mpg,horsepower,origin_country
0,0,Toyota Corolla Mark ii,24,95,Japan
1,1,Chevrolet Chevelle Malibu,18,130,US
2,2,Audi 100 LS,24,90,Germany
3,3,BMW 2002,26,113,Germany
4,4,Datsun PL510,27,88,Japan


In [None]:
# 2. Create a data frame using a dic

In [15]:
df_dict = {'Year' : [1990, 1994, 1998, 2002],
           'Country' : ['Italy', 'USA', 'France', 'Japan'],
           'Winner' : ['Germany', 'Brazil', 'France', 'Brazil'],
           'GoalScored' : [115, 141, 171, 161]
          }

data = pd.DataFrame(df_dict)
data

Unnamed: 0,Year,Country,Winner,GoalScored
0,1990,Italy,Germany,115
1,1994,USA,Brazil,141
2,1998,France,France,171
3,2002,Japan,Brazil,161


In [17]:
df_dict = {'Year' : pd.Series([1990, 1994, 1998]),
           'Country' : pd.Series(['Italy', 'USA', 'France', 'Japan']),
           'Winner' : pd.Series(['Germany', 'Brazil', 'France', 'Brazil']),
           'GoalScored' : pd.Series([115, 141, 171, 161])
          }

data = pd.DataFrame(df_dict)
data

Unnamed: 0,Year,Country,Winner,GoalScored
0,1990.0,Italy,Germany,115
1,1994.0,USA,Brazil,141
2,1998.0,France,France,171
3,,Japan,Brazil,161


In [25]:
df_lotuples = [(2002, 'Japan', 'Brazil', 161), 
                (2006, 'Germany', 'Italy', 147), 
                (2010, 'South Africa', 'Spain', 145),
                (2014, 'Brazil', 'Germany', 171)
              ]

df = pd.DataFrame(df_lotuples)
df

Unnamed: 0,0,1,2,3
0,2002,Japan,Brazil,161
1,2006,Germany,Italy,147
2,2010,South Africa,Spain,145
3,2014,Brazil,Germany,171


In [31]:
df_lotuples = [(2002, 'Japan', 'Brazil', 161), 
                (2006, 'Germany', 'Italy', 147), 
                (2010, 'South Africa', 'Spain', 145),
                (2014, 'Brazil', 'Germany', 171)
              ]

df = pd.DataFrame(df_lotuples , columns= ['Year' , 'Country' , 'winner' , 'score'])
df

Unnamed: 0,Year,Country,winner,score
0,2002,Japan,Brazil,161
1,2006,Germany,Italy,147
2,2010,South Africa,Spain,145
3,2014,Brazil,Germany,171


In [None]:
# 

In [36]:
df_lotuples = [(2002, 'Japan', 'Brazil', 161), 
                (2006, 'Germany', 'Italy', 147), 
                (2010, 'South Africa', 'Spain', 145),
                (2014, 'Brazil', 'Germany', 171)
              ]

df = pd.DataFrame(df_lotuples)
df

Unnamed: 0,0,1,2,3
0,2002,Japan,Brazil,161
1,2006,Germany,Italy,147
2,2010,South Africa,Spain,145
3,2014,Brazil,Germany,171


In [37]:
df = df.T

In [39]:
df.index = ['Year' , 'Country' , 'winner' , 'score']

In [41]:
df = df.T

In [42]:
df

Unnamed: 0,Year,Country,winner,score
0,2002,Japan,Brazil,161
1,2006,Germany,Italy,147
2,2010,South Africa,Spain,145
3,2014,Brazil,Germany,171


In [44]:
df_lodict = [
             {'year' : 2002, 'HostCountry' : 'Japan', 'Winner' : 'Brazil'},
             {'year' : 2006, 'HostCountry' : 'Germany', 'Winner' : 'Italy'},
             {'year' : 2010, 'HostCountry' : 'South Africa', 'Winner' : 'Spain'},
             {'year' : 2014, 'HostCountry' : 'Brazil', 'Winner' : 'Germany'},
            ]

df = pd.DataFrame(df_lodict)
df

Unnamed: 0,year,HostCountry,Winner
0,2002,Japan,Brazil
1,2006,Germany,Italy
2,2010,South Africa,Spain
3,2014,Brazil,Germany


In [8]:
blks = [
    { "gym": False, "school": True, "store": False, },
    {"store": False },
    { "gym": True, "school": True, "store": False, },
    { "gym": False, "school": True, "store": False, },
    { "gym": False, "store": True, } ]

df = pd.DataFrame(blks)
df

Unnamed: 0,gym,school,store
0,False,True,False
1,,,False
2,True,True,False
3,False,True,False
4,False,,True


In [20]:
df_lotuples = ([2002, 'Brazil',161], 
                (2006, 'Germany', 'Italy', 147), 
                (2010, 'South Africa', 'Spain', 145),
                (2014, 'Brazil', 'Germany', 171))
              

df = pd.DataFrame(df_lotuples)
df

Unnamed: 0,0,1,2,3
0,2002,Brazil,161,
1,2006,Germany,Italy,147.0
2,2010,South Africa,Spain,145.0
3,2014,Brazil,Germany,171.0


# Pandas dataframe basics

In [22]:
df = pd.read_csv('datasets/weather_data.csv')
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [23]:
# shape

df.shape

(6, 4)

In [58]:
# head() --->

df = pd.read_csv('datasets/citibike_tripdata.csv')
df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,end station id,end station name,bikeid,name_localizedValue,usertype
0,338,2018-05-01 00:04:47,2018-05-01 00:10:25,3639,Harborside,3199,Newport Pkwy,33558,Annual Membership,Subscriber
1,1482,2018-05-01 01:31:10,2018-05-01 01:55:53,3681,Grand St,3185,City Hall,33593,24 Hour,Customer
2,232,2018-05-01 01:31:29,2018-05-01 01:35:22,3194,McGinley Square,3193,Lincoln Park,29217,FREE Bonus Month with Annual Membership,Subscriber
3,190,2018-05-01 02:03:29,2018-05-01 02:06:40,3185,City Hall,3186,Grove St PATH,29662,24 Hour,Customer
4,303,2018-05-01 04:27:12,2018-05-01 04:32:16,3207,Oakland Ave,3195,Sip Ave,15271,Annual Membership,Subscriber
...,...,...,...,...,...,...,...,...,...,...
32423,396,2018-05-30 13:49:18,2018-05-30 13:55:55,3184,Paulus Hook,3279,Dixon Mills,29639,Join Citi Bike for $14.95/month,Subscriber
32424,313,2018-05-30 13:49:21,2018-05-30 13:54:35,3202,Newport PATH,3639,Harborside,26301,$25 Off Annual Membership,Subscriber
32425,316,2018-05-30 13:49:51,2018-05-30 13:55:08,3220,5 Corners Library,3195,Sip Ave,29260,Annual Membership,Subscriber
32426,1130,2018-05-30 13:50:52,2018-05-30 14:09:42,3281,Leonard Gordon Park,3213,Van Vorst Park,26239,Annual Membership,Subscriber


In [25]:
# top 10 rows of data

df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,end station id,end station name,bikeid,name_localizedValue,usertype
0,338,2018-05-01 00:04:47,2018-05-01 00:10:25,3639,Harborside,3199,Newport Pkwy,33558,Annual Membership,Subscriber
1,1482,2018-05-01 01:31:10,2018-05-01 01:55:53,3681,Grand St,3185,City Hall,33593,24 Hour,Customer
2,232,2018-05-01 01:31:29,2018-05-01 01:35:22,3194,McGinley Square,3193,Lincoln Park,29217,FREE Bonus Month with Annual Membership,Subscriber
3,190,2018-05-01 02:03:29,2018-05-01 02:06:40,3185,City Hall,3186,Grove St PATH,29662,24 Hour,Customer
4,303,2018-05-01 04:27:12,2018-05-01 04:32:16,3207,Oakland Ave,3195,Sip Ave,15271,Annual Membership,Subscriber


In [26]:
df.head(10)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,end station id,end station name,bikeid,name_localizedValue,usertype
0,338,2018-05-01 00:04:47,2018-05-01 00:10:25,3639,Harborside,3199,Newport Pkwy,33558,Annual Membership,Subscriber
1,1482,2018-05-01 01:31:10,2018-05-01 01:55:53,3681,Grand St,3185,City Hall,33593,24 Hour,Customer
2,232,2018-05-01 01:31:29,2018-05-01 01:35:22,3194,McGinley Square,3193,Lincoln Park,29217,FREE Bonus Month with Annual Membership,Subscriber
3,190,2018-05-01 02:03:29,2018-05-01 02:06:40,3185,City Hall,3186,Grove St PATH,29662,24 Hour,Customer
4,303,2018-05-01 04:27:12,2018-05-01 04:32:16,3207,Oakland Ave,3195,Sip Ave,15271,Annual Membership,Subscriber
5,176,2018-05-01 04:37:05,2018-05-01 04:40:01,3194,McGinley Square,3195,Sip Ave,29298,Annual Membership,Subscriber
6,577,2018-05-01 05:05:46,2018-05-01 05:15:23,3225,Baldwin at Montgomery,3186,Grove St PATH,33619,Annual Membership from Citi Bike App,Subscriber
7,830,2018-05-01 05:11:50,2018-05-01 05:25:41,3207,Oakland Ave,3185,City Hall,33624,Annual Membership,Subscriber
8,395,2018-05-01 05:12:07,2018-05-01 05:18:42,3225,Baldwin at Montgomery,3186,Grove St PATH,26300,Annual Membership,Subscriber
9,170,2018-05-01 05:13:52,2018-05-01 05:16:43,3206,Hilltop,3195,Sip Ave,33555,$25 Off Annual Membership,Subscriber


In [27]:
df.tail()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,end station id,end station name,bikeid,name_localizedValue,usertype
32423,396,2018-05-30 13:49:18,2018-05-30 13:55:55,3184,Paulus Hook,3279,Dixon Mills,29639,Join Citi Bike for $14.95/month,Subscriber
32424,313,2018-05-30 13:49:21,2018-05-30 13:54:35,3202,Newport PATH,3639,Harborside,26301,$25 Off Annual Membership,Subscriber
32425,316,2018-05-30 13:49:51,2018-05-30 13:55:08,3220,5 Corners Library,3195,Sip Ave,29260,Annual Membership,Subscriber
32426,1130,2018-05-30 13:50:52,2018-05-30 14:09:42,3281,Leonard Gordon Park,3213,Van Vorst Park,26239,Annual Membership,Subscriber
32427,369,2018-05-30 13:50:58,2018-05-30 13:57:08,3220,5 Corners Library,3207,Oakland Ave,33660,Annual Membership,Subscriber


In [28]:
df.tail(10)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,end station id,end station name,bikeid,name_localizedValue,usertype
32418,340,2018-05-30 13:42:45,2018-05-30 13:48:26,3203,Hamilton Park,3199,Newport Pkwy,33652,Annual Membership,Subscriber
32419,307,2018-05-30 13:44:09,2018-05-30 13:49:16,3187,Warren St,3202,Newport PATH,33558,Annual Membership,Subscriber
32420,485,2018-05-30 13:44:46,2018-05-30 13:52:51,3183,Exchange Place,3211,Newark Ave,29534,Annual Membership - Save 15%,Subscriber
32421,255,2018-05-30 13:45:38,2018-05-30 13:49:54,3183,Exchange Place,3213,Van Vorst Park,29443,Annual Membership,Subscriber
32422,108,2018-05-30 13:49:12,2018-05-30 13:51:00,3202,Newport PATH,3199,Newport Pkwy,29461,Annual Membership,Subscriber
32423,396,2018-05-30 13:49:18,2018-05-30 13:55:55,3184,Paulus Hook,3279,Dixon Mills,29639,Join Citi Bike for $14.95/month,Subscriber
32424,313,2018-05-30 13:49:21,2018-05-30 13:54:35,3202,Newport PATH,3639,Harborside,26301,$25 Off Annual Membership,Subscriber
32425,316,2018-05-30 13:49:51,2018-05-30 13:55:08,3220,5 Corners Library,3195,Sip Ave,29260,Annual Membership,Subscriber
32426,1130,2018-05-30 13:50:52,2018-05-30 14:09:42,3281,Leonard Gordon Park,3213,Van Vorst Park,26239,Annual Membership,Subscriber
32427,369,2018-05-30 13:50:58,2018-05-30 13:57:08,3220,5 Corners Library,3207,Oakland Ave,33660,Annual Membership,Subscriber


In [30]:
# index 

df.index

RangeIndex(start=0, stop=32428, step=1)

In [32]:
df.columns

Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'end station id', 'end station name', 'bikeid',
       'name_localizedValue', 'usertype'],
      dtype='object')

In [55]:
df = pd.read_csv('datasets/weather_data.csv')
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [34]:
# info()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   day          6 non-null      object
 1   temperature  6 non-null      int64 
 2   windspeed    6 non-null      int64 
 3   event        6 non-null      object
dtypes: int64(2), object(2)
memory usage: 320.0+ bytes


In [35]:
df = pd.read_csv('datasets/weather_data_nan.csv')
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          9 non-null      object 
 1   temperature  5 non-null      float64
 2   windspeed    5 non-null      float64
 3   event        7 non-null      object 
dtypes: float64(2), object(2)
memory usage: 416.0+ bytes


# Basic statistics 

In [38]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [40]:
df.describe()

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,30.333333,4.666667
std,3.829708,2.33809
min,24.0,2.0
25%,28.75,2.5
50%,31.5,5.0
75%,32.0,6.75
max,35.0,7.0


In [41]:
df.describe(include = 'object')

Unnamed: 0,day,event
count,6,6
unique,6,3
top,1/1/2017,Rain
freq,1,2


In [43]:
df['event'][2] = 'Sunny'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['event'][2] = 'Sunny'


In [44]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Sunny
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [45]:
df.describe(include= 'object')

Unnamed: 0,day,event
count,6,6
unique,6,3
top,1/1/2017,Sunny
freq,1,3


In [46]:
df.describe(include = 'all')

Unnamed: 0,day,temperature,windspeed,event
count,6,6.0,6.0,6
unique,6,,,3
top,1/1/2017,,,Sunny
freq,1,,,3
mean,,30.333333,4.666667,
std,,3.829708,2.33809,
min,,24.0,2.0,
25%,,28.75,2.5,
50%,,31.5,5.0,
75%,,32.0,6.75,


# indexing and slicing

In [65]:
df = pd.read_csv('datasets/weather_data.csv')
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [66]:
df_1 = pd.read_csv('datasets/citibike_tripdata.csv')
df_1

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,end station id,end station name,bikeid,name_localizedValue,usertype
0,338,2018-05-01 00:04:47,2018-05-01 00:10:25,3639,Harborside,3199,Newport Pkwy,33558,Annual Membership,Subscriber
1,1482,2018-05-01 01:31:10,2018-05-01 01:55:53,3681,Grand St,3185,City Hall,33593,24 Hour,Customer
2,232,2018-05-01 01:31:29,2018-05-01 01:35:22,3194,McGinley Square,3193,Lincoln Park,29217,FREE Bonus Month with Annual Membership,Subscriber
3,190,2018-05-01 02:03:29,2018-05-01 02:06:40,3185,City Hall,3186,Grove St PATH,29662,24 Hour,Customer
4,303,2018-05-01 04:27:12,2018-05-01 04:32:16,3207,Oakland Ave,3195,Sip Ave,15271,Annual Membership,Subscriber
...,...,...,...,...,...,...,...,...,...,...
32423,396,2018-05-30 13:49:18,2018-05-30 13:55:55,3184,Paulus Hook,3279,Dixon Mills,29639,Join Citi Bike for $14.95/month,Subscriber
32424,313,2018-05-30 13:49:21,2018-05-30 13:54:35,3202,Newport PATH,3639,Harborside,26301,$25 Off Annual Membership,Subscriber
32425,316,2018-05-30 13:49:51,2018-05-30 13:55:08,3220,5 Corners Library,3195,Sip Ave,29260,Annual Membership,Subscriber
32426,1130,2018-05-30 13:50:52,2018-05-30 14:09:42,3281,Leonard Gordon Park,3213,Van Vorst Park,26239,Annual Membership,Subscriber


In [85]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [69]:
df.describe(include = 'object')

Unnamed: 0,day,event
count,6,6
unique,6,3
top,1/1/2017,Rain
freq,1,2


In [87]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [111]:
for i in df['event'].unique():
    print(f"{str(i)}  -----  {len(df.loc[df['event'] == str(i)])}")

Rain  -----  2
Sunny  -----  2
Snow  -----  2


In [97]:
len(df.loc[df['event'] == 'Rain'])

2

In [72]:
df['event'].unique()

array(['Rain', 'Sunny', 'Snow'], dtype=object)

In [73]:
df['event'].mode()

0     Rain
1     Snow
2    Sunny
Name: event, dtype: object

In [75]:
df_1.describe(include= 'object')

Unnamed: 0,starttime,stoptime,start station name,end station name,name_localizedValue,usertype
count,32428,32428,32428,32428,32428,32428
unique,31859,31899,50,61,32,2
top,2018-05-11 16:28:13,2018-05-25 09:10:30,Grove St PATH,Grove St PATH,Annual Membership,Subscriber
freq,3,3,3313,4720,20056,30053


In [79]:
df_1['start station name'].unique()

array(['Harborside', 'Grand St', 'McGinley Square', 'City Hall',
       'Oakland Ave', 'Baldwin at Montgomery', 'Hilltop',
       'Brunswick & 6th', 'Christ Hospital', 'Morris Canal',
       'Essex Light Rail', 'Marin Light Rail', 'Jersey & 6th St',
       'Leonard Gordon Park', 'Liberty Light Rail', 'Paulus Hook',
       'Newark Ave', 'Hamilton Park', 'Astor Place', 'Dixon Mills',
       'Pershing Field', 'Monmouth and 6th', 'Newport Pkwy',
       'Brunswick St', 'Union St', 'Grove St PATH',
       'Communipaw & Berry Lane', 'York St', 'Sip Ave', 'Van Vorst Park',
       'Warren St', 'Heights Elevator', 'Montgomery St', 'Riverview Park',
       'Columbus Drive', 'Exchange Place', 'Lafayette Park',
       'Fairmount Ave', 'Dey St', 'JC Medical Center', 'Jersey & 3rd',
       'Newport PATH', 'Manila & 1st', 'Lincoln Park', 'Washington St',
       'Bergen Ave', 'Glenwood Ave', 'Journal Square',
       '5 Corners Library', 'JCBS Depot'], dtype=object)

In [80]:
df_1['start station name'].mode()

0    Grove St PATH
Name: start station name, dtype: object

In [78]:
df_1

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,end station id,end station name,bikeid,name_localizedValue,usertype
0,338,2018-05-01 00:04:47,2018-05-01 00:10:25,3639,Harborside,3199,Newport Pkwy,33558,Annual Membership,Subscriber
1,1482,2018-05-01 01:31:10,2018-05-01 01:55:53,3681,Grand St,3185,City Hall,33593,24 Hour,Customer
2,232,2018-05-01 01:31:29,2018-05-01 01:35:22,3194,McGinley Square,3193,Lincoln Park,29217,FREE Bonus Month with Annual Membership,Subscriber
3,190,2018-05-01 02:03:29,2018-05-01 02:06:40,3185,City Hall,3186,Grove St PATH,29662,24 Hour,Customer
4,303,2018-05-01 04:27:12,2018-05-01 04:32:16,3207,Oakland Ave,3195,Sip Ave,15271,Annual Membership,Subscriber
...,...,...,...,...,...,...,...,...,...,...
32423,396,2018-05-30 13:49:18,2018-05-30 13:55:55,3184,Paulus Hook,3279,Dixon Mills,29639,Join Citi Bike for $14.95/month,Subscriber
32424,313,2018-05-30 13:49:21,2018-05-30 13:54:35,3202,Newport PATH,3639,Harborside,26301,$25 Off Annual Membership,Subscriber
32425,316,2018-05-30 13:49:51,2018-05-30 13:55:08,3220,5 Corners Library,3195,Sip Ave,29260,Annual Membership,Subscriber
32426,1130,2018-05-30 13:50:52,2018-05-30 14:09:42,3281,Leonard Gordon Park,3213,Van Vorst Park,26239,Annual Membership,Subscriber
