# Pandas

In [3]:
#pandas created in 2008 by Wes McKinney + 2012 => book "Python for Data Analysis" update in 2017
#Open Source New BSD licences with 100 diffrent contributors (strong community)
#planet python : http://planetpython.org/ => blog aggregator for python related new 
#contains python tutorials
#Podcast on Python : Python Bytes by Michael Kennedy and Brian Okken => python news related => keep up 
#with some activity in the python ecosystem
#Podcast on DS : Data Skeptic (2014) => covers DS with lessons, interviews, trends and share community
#project (OpenHouse)

In [4]:
#The serie => core of data structure => cross btw list and dictionary
#items are stored in an order + labels => retrieve them. To visualize this in 2 column
#1st col = index => like dictionary and the 2nd col = actual data.
#tha data col has a label of its own => retrieve using .name attribute => useful to merge multiple
#col of data

In [5]:
import pandas as pd

In [6]:
students = ['Willy','Tyty','Manu']
pd.Series(students)

0    Willy
1     Tyty
2     Manu
dtype: object

In [7]:
#Pandas stores series values in a typed array using numpy library.
numbers = [1,2,3]
pd.Series(numbers) #rslt dtype = int64

0    1
1    2
2    3
dtype: int64

In [8]:
#Some typing details exist for performance : How Pandas handle missing data
#Pandas => do type conversion for us

students = ['Willy','Tyty',None]
pd.Series(students)

0    Willy
1     Tyty
2     None
dtype: object

In [9]:
#if list of numbers, int, float => None type converted by Pandas as a floating point value => NaN
numbers = [1,2,None]
pd.Series(numbers)  #NaN = different value + Pandas converted dtype as float64 instead of object or int

0    1.0
1    2.0
2    NaN
dtype: float64

In [10]:
#RQ: wondering with list of int = float => exist missing values.
#NaN and None handle the same way by DS but theses values => differents reprensentation by Pandas
#NaN not equivalent to None

import numpy as np
np.nan == None

False

In [11]:
np.nan == np.nan #use special function to test the presence of not a number (NaN)

False

In [12]:
np.isnan(np.nan) #Non equivalence due to efficency reasons

True

In [13]:
#Series can be created using dictionary data
students_scores = {'Willy':'Maths','Tyty':'English','Manu':'Physics'}
s = pd.Series(students_scores)
s

Willy      Maths
Tyty     English
Manu     Physics
dtype: object

In [14]:
s.index #dtype of object is not only for strings

Index(['Willy', 'Tyty', 'Manu'], dtype='object')

In [15]:
students=[('Willy','Sanchew'),('Tyty','Nguyen'),('Manu','Pierrat')]
pd.Series(students)
#We see that each tuples are stored in series object + type = object

0    (Willy, Sanchew)
1      (Tyty, Nguyen)
2     (Manu, Pierrat)
dtype: object

In [16]:
s = pd.Series(['Maths','English','Physics'], index = ['Willy','Tyty','Manu'])
s

Willy      Maths
Tyty     English
Manu     Physics
dtype: object

In [17]:
#What happens if the list of values in the index => not aligned with the keys in my dictionnary
#for creating the series. Pandas overrides the auto creation to favor only and all indices values 
#that i provided => ignore from my dict all keys not in my index + Pandas add None or NaN values

students_scores = {'Willy':'Maths','Tyty':'English','Manu':'Physics'}
s = pd.Series(students_scores, index=['Willy','Manu','Camille'])
s

Willy        Maths
Manu       Physics
Camille        NaN
dtype: object

## Querying a Series

In [18]:
#Pandas Series => can be queried either by index position or index label
#numeric location => iloc attribute
#index label => loc attribute
students_classes = {'Willy':'Maths','Tyty':'English','Manu':'Physics','Camille':'Info'}
s = pd.Series(students_classes)
s


Willy        Maths
Tyty       English
Manu       Physics
Camille       Info
dtype: object

In [19]:
s.iloc[3]

'Info'

In [20]:
s.loc['Tyty']

'English'

In [21]:
#iloc and loc are attribute. Methods requires ()
s[3]==s.iloc[3]

True

In [22]:
s['Tyty']==s.loc['Tyty']

True

In [23]:
# What if my index = list of integer => complicated and Pandas can't deter auto if i intend to query 
# by index label or index position => Cautius => use loc and iloc attribute

class_code = {99:'Maths',
              100:'Physics',
              101:'English',
              102:'Info'}
s = pd.Series(class_code)
s

99       Maths
100    Physics
101    English
102       Info
dtype: object

In [24]:
#s[0] => Keyerreur 
s.iloc[0]

'Maths'

In [25]:
grades = pd.Series([90,80,70,60])

total = 0
for grade in grades:
    total += grade
print(total/len(grades))

75.0


In [26]:
#it works but it slow. Modern computers => do tasks simultaneously, especially maths tasks
# Pandas and Numpy =>support a method of computation called vectorization.
#Vectorization => works with most of functions in numpy library
#Vectorization => ability for a computer to exe multiple instructions at once + high performance
#with graphic card => dynamic speedups. Modern graphics cards => run thousands of instructions in 
#parallel.

import numpy as np 
total = np.sum(grades) #Vectorization method
print(total/len(grades))

75.0


In [27]:
#Two methods => same rslt but which one is faster. Jupyter Notebook => fonction to help => timeit.

numbers = pd.Series(np.random.randint(0,1000,10000)) #10000 numbers btw 0-1000
numbers.head() #top 5 items

0    702
1    336
2    798
3     82
4    366
dtype: int32

In [28]:
len(numbers)

10000

In [29]:
#ipython interpreter => contains magic functions => begins "%". on commence par % + Tab key 
# => see a list of available functions (RQ: Possible to write my own magic function)
#timeit. => run our code a few times => determine on average computational time of the methods
# can give the numbers of loops that i would like to run

"""%%timeit -n 100 #100 loops
total = 0
for number in grades:
    total += number
    
total/len(numbers) """ 

#RQ : Pour utiliser une magic function => doit être la 1ère ligne de la cellule Jupyter

'%%timeit -n 100 #100 loops\ntotal = 0\nfor number in grades:\n    total += number\n    \ntotal/len(numbers) '

In [30]:
%%timeit -n 100 #100 loops 
total = 0
for number in grades:
    total += number
    
total/len(numbers)

4.4 µs ± 90.2 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [31]:
%%timeit -n 100
total = np.sum(numbers)
total/len(numbers) #DOR problem niveau tps
#Normalement la vectorization est la plus rapide

123 µs ± 30.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [32]:
#Related feature => broadcasting => apply an operation to every value in the Series.
numbers.head()

0    702
1    336
2    798
3     82
4    366
dtype: int32

In [33]:
numbers+=2
numbers.head()

0    704
1    338
2    800
3     84
4    368
dtype: int32

In [34]:
#iteritems() => returns a label + value
"""for label, value in numbers.iteritems():
    numbers.at[label,value + 2] #in recent pandas versions .set_value() = .at()
numbers.head() """ 

'for label, value in numbers.iteritems():\n    numbers.at[label,value + 2] #in recent pandas versions .set_value() = .at()\nnumbers.head() '

In [35]:
#iteritems() => returns a label + value
"""for label, value in numbers.iteritems():
    numbers.set_value(label,value+2) #in recent pandas versions .set_value() = .at()
numbers.head()"""

'for label, value in numbers.iteritems():\n    numbers.set_value(label,value+2) #in recent pandas versions .set_value() = .at()\nnumbers.head()'

In [36]:
%%timeit -n 10
s = pd.Series(np.random.randint(1,1000,1000))
for label, value in s.iteritems():
    s.loc[label] = value+2
    

54.5 ms ± 2.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
%%timeit -n 10
s = pd.Series(np.random.randint(1,1000,1000))
s+=2 #broadcasting

298 µs ± 140 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
#.loc attribute => modify data in place + add new data if value we pass in  as the index doesn´t 
#exist => new entry
#RQ: index can have mixed types

s = pd.Series([1,2,3])
s.loc['History'] = 102
s

0            1
1            2
2            3
History    102
dtype: int64

In [39]:
#Last ex where index value aren't unique => Pandas Series a little diff conceptually
#as relational database.

students_classes = pd.Series({'Willy':'Maths','Tyty':'English','Manu':'Physics','Camille':'info'})
students_classes

Willy        Maths
Tyty       English
Manu       Physics
Camille       info
dtype: object

In [40]:
Jean_classes = pd.Series(['Maths','Physics','Art'], index=['Jean','Jean','Jean'])
Jean_classes

Jean      Maths
Jean    Physics
Jean        Art
dtype: object

In [41]:
#Append all the data => .append()
all_students_classes = students_classes.append(Jean_classes)
all_students_classes

Willy        Maths
Tyty       English
Manu       Physics
Camille       info
Jean         Maths
Jean       Physics
Jean           Art
dtype: object

In [42]:
#Pandas will try to infer the best data types to use
#append method => doesn't change the underlying Series objects
students_classes

Willy        Maths
Tyty       English
Manu       Physics
Camille       info
dtype: object

In [43]:
#If we try to appended series for Jean => not a single value but a series itself
all_students_classes.loc['Jean']

Jean      Maths
Jean    Physics
Jean        Art
dtype: object

## DataFrame

In [44]:
#Pandas two dimensional series object, data structure => DataFrame => similar of the series object 
#but with multiple col of data 

In [45]:
record1 = pd.Series({'Name':'Willy','Class':'Maths','Score':'85'})
record2 = pd.Series({'Name':'Tyty','Class':'Info','Score':'82'})
record3 = pd.Series({'Name':'Manu','Class':'Physics','Score':'90'})

In [46]:
df = pd.DataFrame([record1,record2,record3],index=['school1','school2','school3'])
df.head()

Unnamed: 0,Name,Class,Score
school1,Willy,Maths,85
school2,Tyty,Info,82
school3,Manu,Physics,90


In [47]:
#Jupyter creates a nice HTML to renders rslt of df 
#An alternative method = list of dict, where each dict = row of data
students = [{'Name':'Willy','Class':'Maths','Score':'85'},{'Name':'Tyty','Class':'Info','Score':'82'},
           {'Name':'Manu','Class':'Physics','Score':'90'}]
df = pd.DataFrame(students,index=['school1','school2','school1'])
df.head()

Unnamed: 0,Name,Class,Score
school1,Willy,Maths,85
school2,Tyty,Info,82
school1,Manu,Physics,90


In [48]:
#Extract data with loc and iloc attributes. But this time Dataframe => 2 dim => loc[i,j]
df.loc['school2']

Name     Tyty
Class    Info
Score      82
Name: school2, dtype: object

In [49]:
type(df.loc['school2'])

pandas.core.series.Series

In [50]:
df.loc['school1']

Unnamed: 0,Name,Class,Score
school1,Willy,Maths,85
school1,Manu,Physics,90


In [51]:
type(df.loc['school1'])

pandas.core.frame.DataFrame

In [52]:
#If we are interested in school1 student names
df.loc['school1','Name']

school1    Willy
school1     Manu
Name: Name, dtype: object

In [53]:
#How to select a single column ? 
#1: transpose the matrix => pivots all the rows into col => T attribute
df.T.loc['Name']

school1    Willy
school2     Tyty
school1     Manu
Name: Name, dtype: object

In [54]:
#However, loc and iloc used for row selection => Pandas reserves the index operators directly on the 
#DF for column selection. In Panda's DF col always have a name. So selection => alway label based

df['Name']

school1    Willy
school2     Tyty
school1     Manu
Name: Name, dtype: object

In [55]:
#Note that the rslt of single col projection => Series Object
type(df['Name'])

pandas.core.series.Series

In [56]:
df.loc['school1']['Name']

school1    Willy
school1     Manu
Name: Name, dtype: object

In [57]:
#If i'm confused => check the response  => type()
print(type(df.loc['school1']))
print(type(df.loc['school1']['Name']))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [58]:
#RQ: Chaining tends to cause Pandas to return a copy of the DF instead of a view on the DF
#For selecting data => not big deal. But it might slower than necessary
#For changing data => important distinction => source of error

In [59]:
#We ask for all the names and scores for all schools using .loc ope
df.loc[:,['Name','Score']]

Unnamed: 0,Name,Score
school1,Willy,85
school2,Tyty,82
school1,Manu,90


In [60]:
#RQ: That's Select+Project data from DF based on row and col. Key concepts to remember
#=> rows and col are just for our benefits. Underneath => just 2 axes label array and transposing
#col is easy. Try to avoid the issue of Chaining

In [61]:
#dropping data
#drop function doesn't change the DF by default, instead it returns the copy of DF with the rows removed

df.drop('school1')


Unnamed: 0,Name,Class,Score
school2,Tyty,Info,82


In [62]:
df # still intact

Unnamed: 0,Name,Class,Score
school1,Willy,Maths,85
school2,Tyty,Info,82
school1,Manu,Physics,90


In [63]:
#Drop function => contains an interesting param = inplace. If the inplace = True => updates DF
# instead of a copy. Second param = axes, by dft = 0 => rows, if axes =1 => col

copy_df = df.copy()
copy_df.drop('Name',inplace=True,axis=1)
copy_df

Unnamed: 0,Class,Score
school1,Maths,85
school2,Info,82
school1,Physics,90


In [64]:
#2nd way to drop a col => through indexing operator => with del keyword => immediate effect on DF
del copy_df['Class']
copy_df

Unnamed: 0,Score
school1,85
school2,82
school1,90


In [65]:
#Adding a new col to the DF => easy as assigning it to some value using the indexing operator
#To add class ranking col with default value of None

df['ClassRanking'] = None
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Willy,Maths,85,
school2,Tyty,Info,82,
school1,Manu,Physics,90,


## DataFrame Indexing and Loading

In [66]:
#Jupyter Notebook use ipython as the Kernel underneath => provide convenient ways to integrate lower 
#level shell commands which are programs run in the underlying operating system
#use shell command => "cat" => concatenate => outputs the contents of a file
#in ipython => we prepend the line with "!" => exe the reminder of the line as a shell command
#cat function => unix. For Windows => type function

!type Admission_predict.csv 

Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR ,CGPA,Research,Chance of Admit 
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4,4.5,8.87,1,0.76
3,316,104,3,3,3.5,8,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2,3,8.21,0,0.65
6,330,115,5,4.5,3,9.34,1,0.9
7,321,109,3,3,4,8.2,1,0.75
8,308,101,2,3,4,7.9,0,0.68
9,302,102,1,2,1.5,8,0,0.5
10,323,108,3,3.5,3,8.6,0,0.45
11,325,106,3,3.5,4,8.4,1,0.52
12,327,111,4,4,4.5,9,1,0.84
13,328,112,4,4,4.5,9.1,1,0.78
14,307,109,3,4,3,8,1,0.62
15,311,104,3,3.5,2,8.2,1,0.61
16,314,105,3,3.5,2.5,8.3,0,0.54
17,317,107,3,4,3,8.7,0,0.66
18,319,106,3,4,3,8,1,0.65
19,318,110,3,4,3,8.8,0,0.63
20,303,102,3,3.5,3,8.5,0,0.62
21,312,107,3,3,2,7.9,1,0.64
22,325,114,4,3,2,8.4,0,0.7
23,328,116,5,5,5,9.5,1,0.94
24,334,119,5,5,4.5,9.7,1,0.95
25,336,119,5,4,3.5,9.8,1,0.97
26,340,120,5,4.5,4.5,9.6,1,0.94
27,322,109,5,4.5,3.5,8.8,0,0.76
28,298,98,2,1.5,2.5,7.5,1,0.44
29,295,93,1,2,2,7.2,0,0.46
30,310,99,2,1.5,2,7.3,0,0.54
31,300,97,2,3,3,8.1,1,0.65
32,327,103,3,

In [67]:
#Pandas mades it easy to turn a CSV into a DF => read_csv() function
df = pd.read_csv('Admission_predict.csv ')
df.head()

255,321,114,4,4,5,9.12,0,0.85
256,307,110,4,4,4.5,8.37,0,0.79
257,309,99,3,4,4,8.56,0,0.76
258,324,100,3,4,5,8.64,1,0.78
259,326,102,4,5,5,8.76,1,0.77
260,331,119,4,5,4.5,9.34,1,0.9
261,327,108,5,5,3.5,9.13,1,0.87
262,312,104,3,3.5,4,8.09,0,0.71
263,308,103,2,2.5,4,8.36,1,0.7
264,324,111,3,2.5,1.5,8.79,1,0.7
265,325,110,2,3,2.5,8.76,1,0.75
266,313,102,3,2.5,2.5,8.68,0,0.71
267,312,105,2,2,2.5,8.45,0,0.72
268,314,107,3,3,3.5,8.17,1,0.73
269,327,113,4,4.5,5,9.14,0,0.83
270,308,108,4,4.5,5,8.34,0,0.77
271,306,105,2,2.5,3,8.22,1,0.72
272,299,96,2,1.5,2,7.86,0,0.54
273,294,95,1,1.5,1.5,7.64,0,0.49
274,312,99,1,1,1.5,8.01,1,0.52
275,315,100,1,2,2.5,7.95,0,0.58
276,322,110,3,3.5,3,8.96,1,0.78
277,329,113,5,5,4.5,9.45,1,0.89
278,320,101,2,2.5,3,8.62,0,0.7
279,308,103,2,3,3.5,8.49,0,0.66
280,304,102,2,3,4,8.73,0,0.67
281,311,102,3,4.5,4,8.64,1,0.68
282,317,110,3,4,4.5,9.11,1,0.8
283,312,106,3,4,3.5,8.79,1,0.81
284,321,111,3,2.5,3,8.9,1,0.8
285,340,112,4,5,4.5,9.66,1,0.94
286,331,116,5,4,4,9.26,

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [68]:
#We see that index starts with 0 while Serial Number starts from 1
df = pd.read_csv('Admission_predict.csv ',index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [69]:
#Let's change the name of SOP and LOR col names => more precise. 
#Use rename() it takes as param columns, need to pass into a dict which the keys are the old column
#name and the value new col name

new_df = df.rename(columns={'GRE Score':'GRE Score','TOEFL Score':'TOEFL Score',
                           'University Rating':'University Rating',
                           'SOP':'Statement of Purpose','LOR':'Letter of Recommendation',
                           'CGPA':'CGPA','Research':'Research',	
                           'Chance of Admit':'Chance of Admit'})
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of Purpose,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [70]:
#Why LOR doesn't have changed ? 
#1st investigate that all the columns names are correct => columns attribute
new_df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'Statement of Purpose',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [71]:
#We see that there is a space after LOR and Chance of Admit 
#1st way change column name including the space
new_df = df.rename(columns={'LOR ':'Letter of Recommendation','Chance of Admit ':'Chance of Admit'})
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [72]:
# If tab or more space ? 
# 2nd way => create a function => does cleaning then renamed to apply that function across all the Data
#strip() function for handy string

new_df = new_df.rename(mapper=str.strip, axis='columns')
new_df.head()


Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [73]:
#RQ: rename doesn't modify the DF => just a copy in new_df with the changed name.
df.columns #PB with col is very common

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [74]:
#Another way => attribute the good list of column to df.columns => directly modify the DF =>
#efficiency when there is a lot of columns + want to change a few. Technique => not affected by
#subtle errors in col names
#Let's change of the column name to lower case. 
cols = list(df.columns)
cols = [x.lower().strip() for x in cols]
df.columns = cols
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


## Indexing DataFrame

In [75]:
#Another way for setting an index is to use the set_index(). Param = list of columns and promotes
#those col to an index. set_index() => destructive process => doesn't keep the current index.
#Need to create a new col + copy into it values from index attribute.

df = pd.read_csv('Admission_predict.csv', index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [76]:
#We want the index to be the Chance of Admit, but we want the Serial No. for later analysis =>
#new col.
df['Serial Number'] = df.index #copy the index data into its own column
df = df.set_index('Chance of Admit ')
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial Number
Chance of Admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.92,337,118,4,4.5,4.5,9.65,1,1
0.76,324,107,4,4.0,4.5,8.87,1,2
0.72,316,104,3,3.0,3.5,8.0,1,3
0.8,322,110,3,3.5,2.5,8.67,1,4
0.65,314,103,2,2.0,3.0,8.21,0,5


In [77]:
#We can get rid of the index => reset_index(), promote the index into a col + create a dft numb index
df = df.reset_index()
df.head()

Unnamed: 0,Chance of Admit,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial Number
0,0.92,337,118,4,4.5,4.5,9.65,1,1
1,0.76,324,107,4,4.0,4.5,8.87,1,2
2,0.72,316,104,3,3.0,3.5,8.0,1,3
3,0.8,322,110,3,3.5,2.5,8.67,1,4
4,0.65,314,103,2,2.0,3.0,8.21,0,5


In [78]:
#Pandas => multi-lvl indexing => similar to composite keys in the relational DB system
#To create => multi-lvl index => call set_index + give it a list of col that we're interested in
#promoting to an index.
#Good ex = geographical data which is sorted by region or demographics

df = pd.read_csv('census.csv')
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [79]:
df.columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income'],
      dtype='object')

In [80]:
df.shape

(32561, 15)

In [81]:
print(len(df['race'].unique()))
print(df['race'].unique())

5
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']


In [82]:
df=df[df['race']== ' Black']
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
13,32,Private,205019,Assoc-acdm,12,Never-married,Sales,Not-in-family,Black,Male,0,0,50,United-States,<=50K


In [83]:
df.shape

(3124, 15)

In [84]:
#Now we are going to look at to just the total population estimates + total number of Capital gain
#If we want to reduce the data =>

columns_to_keep = ['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income']
df = df[columns_to_keep]
df.head(10)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
13,32,Private,205019,Assoc-acdm,12,Never-married,Sales,Not-in-family,Black,Male,0,0,50,United-States,<=50K
21,54,Private,302146,HS-grad,9,Separated,Other-service,Unmarried,Black,Female,0,0,20,United-States,<=50K
22,35,Federal-gov,76845,9th,5,Married-civ-spouse,Farming-fishing,Husband,Black,Male,0,0,40,United-States,<=50K
31,20,Private,266015,Some-college,10,Never-married,Sales,Own-child,Black,Male,0,0,44,United-States,<=50K
34,22,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,15,United-States,<=50K
45,57,Federal-gov,337895,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,40,United-States,>50K


In [85]:
df = df.set_index(['native-country','marital-status']) #dual index
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,final-weight,education,education-num,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,income
native-country,marital-status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
United-States,Married-civ-spouse,53,Private,234721,11th,7,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K
Cuba,Married-civ-spouse,28,Private,338409,Bachelors,13,Prof-specialty,Wife,Black,Female,0,0,40,<=50K
Jamaica,Married-spouse-absent,49,Private,160187,9th,5,Other-service,Not-in-family,Black,Female,0,0,16,<=50K
United-States,Married-civ-spouse,37,Private,280464,Some-college,10,Exec-managerial,Husband,Black,Male,0,0,80,>50K
United-States,Never-married,32,Private,205019,Assoc-acdm,12,Sales,Not-in-family,Black,Male,0,0,50,<=50K


In [87]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,final-weight,education,education-num,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,income
native-country,marital-status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
United-States,Married-civ-spouse,53,Private,234721,11th,7,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K
Cuba,Married-civ-spouse,28,Private,338409,Bachelors,13,Prof-specialty,Wife,Black,Female,0,0,40,<=50K
Jamaica,Married-spouse-absent,49,Private,160187,9th,5,Other-service,Not-in-family,Black,Female,0,0,16,<=50K
United-States,Married-civ-spouse,37,Private,280464,Some-college,10,Exec-managerial,Husband,Black,Male,0,0,80,>50K
United-States,Never-married,32,Private,205019,Assoc-acdm,12,Sales,Not-in-family,Black,Male,0,0,50,<=50K
United-States,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United-States,Married-civ-spouse,33,Private,273243,HS-grad,9,Craft-repair,Husband,Black,Male,0,0,40,<=50K
United-States,Never-married,22,Private,325033,12th,8,Protective-serv,Own-child,Black,Male,0,0,35,<=50K
United-States,Never-married,30,Private,345898,HS-grad,9,Craft-repair,Not-in-family,Black,Male,0,0,46,<=50K
United-States,Divorced,38,Private,139180,Bachelors,13,Prof-specialty,Unmarried,Black,Female,15020,0,45,>50K


In [90]:
#How we can query this DF => loc => row and col. When we use MulitIndex => provide the argument
#in order by that the lvl we wish to query
df.loc[' United-States',' Married-civ-spouse']

  df.loc[' United-States',' Married-civ-spouse']


Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,final-weight,education,education-num,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,income
native-country,marital-status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
United-States,Married-civ-spouse,53,Private,234721,11th,7,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K
United-States,Married-civ-spouse,37,Private,280464,Some-college,10,Exec-managerial,Husband,Black,Male,0,0,80,>50K
United-States,Married-civ-spouse,35,Federal-gov,76845,9th,5,Farming-fishing,Husband,Black,Male,0,0,40,<=50K
United-States,Married-civ-spouse,22,State-gov,311512,Some-college,10,Other-service,Husband,Black,Male,0,0,15,<=50K
United-States,Married-civ-spouse,57,Federal-gov,337895,Bachelors,13,Prof-specialty,Husband,Black,Male,0,0,40,>50K
United-States,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United-States,Married-civ-spouse,38,Private,257416,9th,5,Transport-moving,Husband,Black,Male,0,0,40,<=50K
United-States,Married-civ-spouse,64,State-gov,222966,7th-8th,4,Other-service,Wife,Black,Female,0,0,40,<=50K
United-States,Married-civ-spouse,61,Private,355645,HS-grad,9,Sales,Husband,Black,Male,0,0,40,<=50K
United-States,Married-civ-spouse,40,Private,142657,Assoc-voc,11,Craft-repair,Husband,Black,Male,0,0,45,<=50K


In [92]:
#If we want to compare 2 counties Native country => list of tuples describing the indices we wish to
#query. Since MultiIndex of 2 values => native country + marital status => provide 2 values as each 
#element of our filtering list. Each tuple should have 2 elements.

df


Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,final-weight,education,education-num,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,income
native-country,marital-status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
United-States,Married-civ-spouse,53,Private,234721,11th,7,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K
Cuba,Married-civ-spouse,28,Private,338409,Bachelors,13,Prof-specialty,Wife,Black,Female,0,0,40,<=50K
Jamaica,Married-spouse-absent,49,Private,160187,9th,5,Other-service,Not-in-family,Black,Female,0,0,16,<=50K
United-States,Married-civ-spouse,37,Private,280464,Some-college,10,Exec-managerial,Husband,Black,Male,0,0,80,>50K
United-States,Never-married,32,Private,205019,Assoc-acdm,12,Sales,Not-in-family,Black,Male,0,0,50,<=50K
United-States,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United-States,Married-civ-spouse,33,Private,273243,HS-grad,9,Craft-repair,Husband,Black,Male,0,0,40,<=50K
United-States,Never-married,22,Private,325033,12th,8,Protective-serv,Own-child,Black,Male,0,0,35,<=50K
United-States,Never-married,30,Private,345898,HS-grad,9,Craft-repair,Not-in-family,Black,Male,0,0,46,<=50K
United-States,Divorced,38,Private,139180,Bachelors,13,Prof-specialty,Unmarried,Black,Female,15020,0,45,>50K


In [93]:
df.loc[ [(' United-States',' Married-civ-spouse'),
        (' United-States',' Married-spouse-absent')] ]
#That's how hierarchical indices work. Hierarchical Labeling isn't only for rows.
#It's possible to transpose a matrix => Hierarchical columns labels
#Projecting a single column => has the same label => works as the same idea 

Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,final-weight,education,education-num,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,income
native-country,marital-status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
United-States,Married-civ-spouse,53,Private,234721,11th,7,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K
United-States,Married-civ-spouse,37,Private,280464,Some-college,10,Exec-managerial,Husband,Black,Male,0,0,80,>50K
United-States,Married-civ-spouse,35,Federal-gov,76845,9th,5,Farming-fishing,Husband,Black,Male,0,0,40,<=50K
United-States,Married-civ-spouse,22,State-gov,311512,Some-college,10,Other-service,Husband,Black,Male,0,0,15,<=50K
United-States,Married-civ-spouse,57,Federal-gov,337895,Bachelors,13,Prof-specialty,Husband,Black,Male,0,0,40,>50K
United-States,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United-States,Married-spouse-absent,37,Local-gov,97136,Some-college,10,Adm-clerical,Unmarried,Black,Female,0,0,40,<=50K
United-States,Married-spouse-absent,33,Private,288840,HS-grad,9,Other-service,Unmarried,Black,Female,0,0,38,<=50K
United-States,Married-spouse-absent,32,Private,143604,10th,6,Other-service,Not-in-family,Black,Female,0,0,37,<=50K
United-States,Married-spouse-absent,41,Private,239833,HS-grad,9,Transport-moving,Unmarried,Black,Male,0,0,50,<=50K


## Querying a DataFrame

In [1]:
#Boolean masking = heart of fast and efficient querying in Numpy, Pandas
#BM = array => 1 dim as Series and 2 dim as DataFrame. Value = True or False. 
#BM superposé on top of the data structure that we are querying.
#Any cell = True => admitted into final rslt

In [2]:
import pandas as pd
df = pd.read_csv('Admission_Predict.csv',index_col=0)
df.columns = [x.lower().strip() for x in df.columns]
df.head(10)

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65
6,330,115,5,4.5,3.0,9.34,1,0.9
7,321,109,3,3.0,4.0,8.2,1,0.75
8,308,101,2,3.0,4.0,7.9,0,0.68
9,302,102,1,2.0,1.5,8.0,0,0.5
10,323,108,3,3.5,3.0,8.6,0,0.45


In [4]:
#BM created by apply operators directly to pandas Series or DF
#We want to see students with a chance of admit >0.7 
# = Broadcasting a comparison oper > than, with rslt = Boolean series.
admit_mask = df['chance of admit'] > 0.7
admit_mask

Serial No.
1       True
2       True
3       True
4       True
5      False
       ...  
396     True
397     True
398     True
399    False
400     True
Name: chance of admit, Length: 400, dtype: bool

In [5]:
#Underneath, Pandas is applying compar oper via Vectorization 
#After BM we can lay it on top of the data => hide data we don't want = False values
# .where() function on the original DF.
df.where(admit_mask).head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,316.0,104.0,3.0,3.0,3.5,8.0,1.0,0.72
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.8
5,,,,,,,,


In [6]:
#Rslt data keeps original index values, all data don't meet CI => NaN values instead
#Next step = dropna()
df.where(admit_mask).dropna().head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,316.0,104.0,3.0,3.0,3.5,8.0,1.0,0.72
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.8
6,330.0,115.0,5.0,4.5,3.0,9.34,1.0,0.9


In [7]:
#The index doesn't contain 5. where() isn't often used. Instead pandas dev => created funct 
#combine both funct
df[df['chance of admit'] >0.7].head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
6,330,115,5,4.5,3.0,9.34,1,0.9


In [8]:
df['gre score'].head()

Serial No.
1    337
2    324
3    316
4    322
5    314
Name: gre score, dtype: int64

In [9]:
df[['gre score','toefl score']].head()

Unnamed: 0_level_0,gre score,toefl score
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,337,118
2,324,107
3,316,104
4,322,110
5,314,103


In [10]:
df[df['gre score'] > 320].head() #send it a boolean mask 
# Combine funct .loc() or .where().dropna().

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
4,322,110,3,3.5,2.5,8.67,1,0.8
6,330,115,5,4.5,3.0,9.34,1,0.9
7,321,109,3,3.0,4.0,8.2,1,0.75


In [11]:
#Combine multiple BM => multiple criteria for including
#In bitmasking in other places in computer science => 'and', if both masks must be True or 'or'
#Unfortunatly => not natural in Pandas
# (df['chance of admit'] > 0.7) and (df['chance of admit'] <0.9) => Valueerror
#PB python doesn't know how to compare 2 series object using 'and' or 'or'
#Authors have overwritten the pipe '|' and '&' to handle this
(df['chance of admit'] > 0.7) & (df['chance of admit'] <0.9) 

Serial No.
1      False
2       True
3       True
4       True
5      False
       ...  
396     True
397     True
398    False
399    False
400    False
Name: chance of admit, Length: 400, dtype: bool

In [13]:
#Need to watch out for is order of operations => Erreur fréquente des débutants de Pandas
#est d'oublier de mettre les () autour de chaque Serie puis l'opérateur entre => () & ()
# df['chance of admit'] > 0.7 & df['chance of admit'] <0.9 => TypeError
#Bc Pandas tries to bitwise and a 0.7 and a Pandas DF
#2nd Method : without comparison oper >
df['chance of admit'].gt(0.7) & df['chance of admit'].lt(0.9)


Serial No.
1      False
2       True
3       True
4       True
5      False
       ...  
396     True
397     True
398    False
399    False
400    False
Name: chance of admit, Length: 400, dtype: bool

In [14]:
#3rd Method: works if the oper needed is built into the DF 
df['chance of admit'].gt(0.7).lt(0.9)

Serial No.
1      False
2      False
3      False
4      False
5       True
       ...  
396    False
397    False
398    False
399     True
400    False
Name: chance of admit, Length: 400, dtype: bool

## Missing values 

In [17]:
#If i'm running a survey and a respondant didn't answer the question => missing value = omission 
# Called *Missing at Random*, if other variable can predict the missing values as:ethnicity, gender
#If there is not relationship to other variables => *Missing Completely at Random (MCAR)*
#Exist many more cases of missing value
#Pandas is pretty good at detecting missing values direct from underlying data formats, like CSV 
#as NaN, NULL, None, N/A

df = pd.read_csv('class_grades.csv')
df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [18]:
#isnull() => create a boolean mask of the whole DF. effectively broadcasting
mask = df.isnull()
mask.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,True,False,False
3,False,True,True,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,True,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [19]:
df.dropna().head(10) #But index 2, 3, 7 and 11 are gone

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61
10,7,80.44,90.2,75.0,91.48,39.72
12,8,97.16,103.71,72.5,93.52,63.33
13,7,91.28,83.53,81.25,99.81,92.22


In [20]:
#But index 2, 3, 7 and 11 are gone, other funct : filling NA => fillna()
#Most DF operator => copy and no modify the Df => Inplace = True
df.fillna(0, inplace=True) 
df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,0.0,63.15,48.89
3,7,0.0,0.0,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,0.0,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [42]:
#use na_filter option to turn off white space filtering. if white space = actual value of interest
#Important sometimes to consider that missing values as acutally having information 
df = pd.read_table('log.txt', delimiter = "\t")
df.head(20)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


In [43]:
#time col = timestamp in the Unix epoch format. 
# Playback position increase by 1 => time stamp increase by 30s except for Bob
#Bob seems to have paused his playback position => increases doesn't change
#Not easy to try to knowledge from the data bc it's not sorted

In [44]:
#Next up is => paramater() => 2 common fill values are ffill and bfill. 
#forward filling => updates an NA value for a particular cell with the value of the previous row
#backward filling => opposite ffill
#Important => for this funct, data must be sorted
#In Pandas we can sort by Index or Values
df = df.set_index('time')
df = df.sort_index()
df.head(20)

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [45]:
#We see that the index isn't a unique value bc to user can une the system at the same time
#Let's reset the index + use multi-lvl indexing on time AND user together 

df = df.reset_index()
df = df.set_index(['time','user'])
df = df.sort_index()
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [46]:
#Now the data is sorted => we can use ffill methods. Impt : Don't have to fix all NaN in one command

df = df.fillna(method='ffill')
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,False,10.0
1469974454,sue,advanced.html,24,False,10.0
1469974484,cheryl,intro.html,7,False,10.0
1469974514,cheryl,intro.html,8,False,10.0
1469974524,sue,advanced.html,25,False,10.0
1469974544,cheryl,intro.html,9,False,10.0
1469974554,sue,advanced.html,26,False,10.0
1469974574,cheryl,intro.html,10,False,10.0


In [47]:
#We can do customized fill-in to replace with replace() funct => allows replacement fr several approach
# => value-to-value, list, dict, regex 
df = pd.DataFrame({'A':[1,1,3,4,5],
                   'B':[3,5,6,7,3],
                   'C':['a','b','c','d','e']})
df

Unnamed: 0,A,B,C
0,1,3,a
1,1,5,b
2,3,6,c
3,4,7,d
4,5,3,e


In [48]:
#Let's replace 1's with 100
df.replace(1,100)

Unnamed: 0,A,B,C
0,100,3,a
1,100,5,b
2,3,6,c
3,4,7,d
4,5,3,e


In [49]:
#How about to change more values => list. change 1's to 100 and 3's to 300
df.replace([1,3],[100,300])

Unnamed: 0,A,B,C
0,100,300,a
1,100,5,b
2,300,6,c
3,4,7,d
4,5,300,e


In [50]:
#Now with regex
df = pd.read_table('log.txt', delimiter = "\t")
df.head(20)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


In [53]:
#To replace using regex we make 1st paramater to replace the regex pattern we want to match
#2nd param : the value we want to emit upon match
#3rd param : regex = True

#I want to detect all html pages in the 'video' col => end with '.html' and i want to overwrite that
# with the keyword 'webpage'

df.replace(to_replace='.*.html$',value='webpage',regex=True)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,webpage,5,False,10.0
1,1469974454,cheryl,webpage,6,,
2,1469974544,cheryl,webpage,9,,
3,1469974574,cheryl,webpage,10,,
4,1469977514,bob,webpage,1,,
5,1469977544,bob,webpage,1,,
6,1469977574,bob,webpage,1,,
7,1469977604,bob,webpage,1,,
8,1469974604,cheryl,webpage,11,,
9,1469974694,cheryl,webpage,14,,


In [54]:
#RQ: statistical funct on DF ignore NaN => we can think of replace NaN by mean value. But we need
# to think of the value that are being excluded. Missing Values really matters depending on the pb

## Manipulating DataFrame

In [37]:
#Try to do a basic data cleaning process + More Pandas API function
#df = pd.read_table('presidents.txt',delimiter='\t')
import pandas as pd
df = pd.read_csv('us_presidents.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,S.No.,start,end,president,prior,party,vice
0,0,1,"April 30, 1789","March 4, 1797",George Washington,Commander-in-Chief of the Continental Army ...,Nonpartisan [13],John Adams
1,1,2,"March 4, 1797","March 4, 1801",John Adams,1st Vice President of the United States,Federalist,Thomas Jefferson
2,2,3,"March 4, 1801","March 4, 1809",Thomas Jefferson,2nd Vice President of the United States,Democratic- Republican,Aaron Burr
3,3,4,"March 4, 1809","March 4, 1817",James Madison,5th United States Secretary of State (1801–...,Democratic- Republican,George Clinton
4,4,5,"March 4, 1817","March 4, 1825",James Monroe,7th United States Secretary of State (1811–...,Democratic- Republican,Daniel D. Tompkins


In [38]:
df['First']=df['president']
df['First']=df['First'].replace('[ ].*','',regex=True)
df.head()

Unnamed: 0.1,Unnamed: 0,S.No.,start,end,president,prior,party,vice,First
0,0,1,"April 30, 1789","March 4, 1797",George Washington,Commander-in-Chief of the Continental Army ...,Nonpartisan [13],John Adams,George
1,1,2,"March 4, 1797","March 4, 1801",John Adams,1st Vice President of the United States,Federalist,Thomas Jefferson,John
2,2,3,"March 4, 1801","March 4, 1809",Thomas Jefferson,2nd Vice President of the United States,Democratic- Republican,Aaron Burr,Thomas
3,3,4,"March 4, 1809","March 4, 1817",James Madison,5th United States Secretary of State (1801–...,Democratic- Republican,George Clinton,James
4,4,5,"March 4, 1817","March 4, 1825",James Monroe,7th United States Secretary of State (1811–...,Democratic- Republican,Daniel D. Tompkins,James


In [39]:
#It works, but it slow and not efficiencly readeable
del(df['First']) #drop the col we've created
#apply() function on a DF => take arbitrary funct that i've written and apply it to Series (Single col)
#or DF across all rows and col

In [40]:
def splitname(row):
    #row = Single Series
    #Let's extract the firstname and create a new entry series
    row['First']=row['president'].split(' ')[0]
    row['Last']=row['president'].split(' ')[-1]
    return row

df = df.apply(splitname, axis = 'columns')
df.head()

Unnamed: 0.1,Unnamed: 0,S.No.,start,end,president,prior,party,vice,First,Last
0,0,1,"April 30, 1789","March 4, 1797",George Washington,Commander-in-Chief of the Continental Army ...,Nonpartisan [13],John Adams,George,Washington
1,1,2,"March 4, 1797","March 4, 1801",John Adams,1st Vice President of the United States,Federalist,Thomas Jefferson,John,Adams
2,2,3,"March 4, 1801","March 4, 1809",Thomas Jefferson,2nd Vice President of the United States,Democratic- Republican,Aaron Burr,Thomas,Jefferson
3,3,4,"March 4, 1809","March 4, 1817",James Madison,5th United States Secretary of State (1801–...,Democratic- Republican,George Clinton,James,Madison
4,4,5,"March 4, 1817","March 4, 1825",James Monroe,7th United States Secretary of State (1811–...,Democratic- Republican,Daniel D. Tompkins,James,Monroe


In [41]:
del(df['First'])
del(df['Last'])

In [42]:
#Extract take regex as input and specifically requires me to set capture groups that correspond
#output col i'm interested in. 
pattern = '(^[\w]*)(?:.* )([\w]*$)' #2nd group = '?:' bc i want this group not to be returned + 
#i want any number of characters followed by white space
#extract() funct is built into str attribute of Series object 
#One single column that i'm taking out of a DF => projected as a Series   
df['president'].str.extract(pattern).head()

Unnamed: 0,0,1
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe


In [43]:
#PB: we want column name
pattern = '(?P<First>^[\w]*)(?:.* )(?P<Last>[\w]*$)' #no name for 2nd group bc won't be returned
names = df['president'].str.extract(pattern)
names

Unnamed: 0,First,Last
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe
5,John,Adams
6,Andrew,Jackson
7,Martin,Buren
8,William,Harrison
9,John,Tyler


In [44]:
#Copy them into our main DF 
df['First'] = names['First']
df['Last'] = names['Last']
df.head(10)

Unnamed: 0.1,Unnamed: 0,S.No.,start,end,president,prior,party,vice,First,Last
0,0,1,"April 30, 1789","March 4, 1797",George Washington,Commander-in-Chief of the Continental Army ...,Nonpartisan [13],John Adams,George,Washington
1,1,2,"March 4, 1797","March 4, 1801",John Adams,1st Vice President of the United States,Federalist,Thomas Jefferson,John,Adams
2,2,3,"March 4, 1801","March 4, 1809",Thomas Jefferson,2nd Vice President of the United States,Democratic- Republican,Aaron Burr,Thomas,Jefferson
3,3,4,"March 4, 1809","March 4, 1817",James Madison,5th United States Secretary of State (1801–...,Democratic- Republican,George Clinton,James,Madison
4,4,5,"March 4, 1817","March 4, 1825",James Monroe,7th United States Secretary of State (1811–...,Democratic- Republican,Daniel D. Tompkins,James,Monroe
5,5,6,"March 4, 1825","March 4, 1829",John Quincy Adams,8th United States Secretary of State (1817–...,Democratic- Republican,John C. Calhoun,John,Adams
6,6,7,"March 4, 1829","March 4, 1837",Andrew Jackson,U.S. Senator ( Class 2 ) from Tennessee ...,Democratic,John C. Calhoun,Andrew,Jackson
7,7,8,"March 4, 1837","March 4, 1841",Martin Van Buren,8th Vice President of the United States,Democratic,Richard Mentor Johnson,Martin,Buren
8,8,9,"March 4, 1841","April 4, 1841",William Henry Harrison,United States Minister to Colombia (1828–1829),Whig,John Tyler,William,Harrison
9,9,10,"April 4, 1841","March 4, 1845",John Tyler,10th Vice President of the United States,"Whig April 4, 1841 – September 13, 1841",Office vacant,John,Tyler


In [45]:
df['start'] = df['start'].str.extract('([\w]{1,10} [\w]{1,2}, [\w]{4})')
df['end'] = df['end'].str.extract('([\w]{1,10} [\w]{1,2}, [\w]{4})')
df.head()

Unnamed: 0.1,Unnamed: 0,S.No.,start,end,president,prior,party,vice,First,Last
0,0,1,"April 30, 1789","March 4, 1797",George Washington,Commander-in-Chief of the Continental Army ...,Nonpartisan [13],John Adams,George,Washington
1,1,2,"March 4, 1797","March 4, 1801",John Adams,1st Vice President of the United States,Federalist,Thomas Jefferson,John,Adams
2,2,3,"March 4, 1801","March 4, 1809",Thomas Jefferson,2nd Vice President of the United States,Democratic- Republican,Aaron Burr,Thomas,Jefferson
3,3,4,"March 4, 1809","March 4, 1817",James Madison,5th United States Secretary of State (1801–...,Democratic- Republican,George Clinton,James,Madison
4,4,5,"March 4, 1817","March 4, 1825",James Monroe,7th United States Secretary of State (1811–...,Democratic- Republican,Daniel D. Tompkins,James,Monroe


In [46]:
df['start'].head()

0    April 30, 1789
1     March 4, 1797
2     March 4, 1801
3     March 4, 1809
4     March 4, 1817
Name: start, dtype: object

In [47]:
#that's clean up the data format, but the type of this col = object and Pandas deals with str
#Pandas has date/time features. Wes McKinnsey put his effort into the library to deal with financial
#transactions
df['start']=pd.to_datetime(df['start'])
df['end']=pd.to_datetime(df['end'])
df['start'].head()

0   1789-04-30
1   1797-03-04
2   1801-03-04
3   1809-03-04
4   1817-03-04
Name: start, dtype: datetime64[ns]

In [48]:
df.head()

Unnamed: 0.1,Unnamed: 0,S.No.,start,end,president,prior,party,vice,First,Last
0,0,1,1789-04-30,1797-03-04,George Washington,Commander-in-Chief of the Continental Army ...,Nonpartisan [13],John Adams,George,Washington
1,1,2,1797-03-04,1801-03-04,John Adams,1st Vice President of the United States,Federalist,Thomas Jefferson,John,Adams
2,2,3,1801-03-04,1809-03-04,Thomas Jefferson,2nd Vice President of the United States,Democratic- Republican,Aaron Burr,Thomas,Jefferson
3,3,4,1809-03-04,1817-03-04,James Madison,5th United States Secretary of State (1801–...,Democratic- Republican,George Clinton,James,Madison
4,4,5,1817-03-04,1825-03-04,James Monroe,7th United States Secretary of State (1811–...,Democratic- Republican,Daniel D. Tompkins,James,Monroe
