# review of pandas dataframe

In [1]:
import pandas as pd

In [8]:
# A pandas Series is a single column of data with an index:
# every row has an index label (0..n-1 by default),
# and the whole Series has one data type (dtype).
raw = [23.4, 54.3, 34.5, 9.0, 22.3]
data = pd.Series(raw)
data

0    23.4
1    54.3
2    34.5
3     9.0
4    22.3
dtype: float64

In [9]:
type(data)

pandas.core.series.Series

In [11]:
# Mixing floats and strings in one Series makes pandas fall back to the
# generic `object` dtype (see the dtype line in the output below).
raw2 = [23.4, 54.3, 34.5, 'a', 'b']
data2 = pd.Series(raw2)
data2

0    23.4
1    54.3
2    34.5
3       a
4       b
dtype: object

In [14]:
summary = data.describe()
summary

count     5.000000
mean     28.700000
std      16.927345
min       9.000000
25%      22.300000
50%      23.400000
75%      34.500000
max      54.300000
dtype: float64

In [16]:
# look up a value by its index label, like a dictionary key
summary['50%']

23.4

In [17]:
# Look up a value by integer position, like a list.
# Use .iloc for positions: summary[0] on a label-indexed Series relies on a
# deprecated positional fallback and is slated to become a label lookup.
summary.iloc[0]

5.0

In [21]:
# supply custom index labels instead of the default 0..n-1
data = pd.Series(raw, index=['M', 'T', 'W', 'R', 'F'])

In [22]:
data

M    23.4
T    54.3
W    34.5
R     9.0
F    22.3
dtype: float64

In [23]:
data['W']

34.5

In [24]:
# similar to dictionary
data = pd.Series({'Mon':23.4, 'Tue':54.3,
                  'Wed':34.5, 'Thu':9.0, 'Fri':22.3})
data

Mon    23.4
Tue    54.3
Wed    34.5
Thu     9.0
Fri    22.3
dtype: float64

In [25]:
# label-based slicing includes the end label (unlike positional slicing)
data['Mon':'Wed']

Mon    23.4
Tue    54.3
Wed    34.5
dtype: float64

In [26]:
data.values

array([23.4, 54.3, 34.5,  9. , 22.3])

In [27]:
grades = pd.read_csv('grades.csv')
grades

Unnamed: 0,Test,Wally,Eva,Sam,Katie,Bob
0,T1,87,90,100,100,83
1,T2,96,87,77,81,90
2,T3,70,90,90,82,85


In [28]:
# col name
grades['Wally']

0    87
1    96
2    70
Name: Wally, dtype: int64

In [29]:
# col name
grades.Wally

0    87
1    96
2    70
Name: Wally, dtype: int64

In [30]:
grades[['Eva', 'Wally']]

Unnamed: 0,Eva,Wally
0,90,87
1,87,96
2,90,70


In [31]:
grades.Test

0    T1
1    T2
2    T3
Name: Test, dtype: object

In [32]:
grades.index

RangeIndex(start=0, stop=3, step=1)

In [33]:
grades.index = grades.Test

In [39]:
grades

Unnamed: 0_level_0,Wally,Eva,Sam,Katie,Bob
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
T1,87,90,100,100,83
T2,96,87,77,81,90
T3,70,90,90,82,85


In [38]:
# The 'Test' values now live in the index, so drop the redundant column.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
grades = grades.drop(columns='Test', errors='ignore')

In [42]:
grades

Unnamed: 0_level_0,Wally,Eva,Sam,Katie,Bob
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
T1,87,90,100,100,83
T2,96,87,77,81,90
T3,70,90,90,82,85


In [43]:
# select a row by its index label with .loc
# a single label returns the row as a Series
grades.loc['T1']

Wally     87
Eva       90
Sam      100
Katie    100
Bob       83
Name: T1, dtype: int64

In [46]:
# A list of labels returns a DataFrame (table), even for a single row.
# Only the last expression of a cell is displayed, so this result is not shown.
grades.loc[['T1']]

# Advantage of the list form: several rows can be selected at once.
# Select rows and columns in a single .loc call instead of chaining
# two separate indexing operations.
grades.loc[['T1', 'T3'], ['Wally', 'Sam', 'Bob']]

Unnamed: 0_level_0,Wally,Sam,Bob
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T1,87,100,83
T3,70,90,85


In [48]:
# select a row by its integer position with .iloc
grades.iloc[1]

Wally    96
Eva      87
Sam      77
Katie    81
Bob      90
Name: T2, dtype: int64

In [49]:
grades.iloc[[1,0]]

Unnamed: 0_level_0,Wally,Eva,Sam,Katie,Bob
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
T2,96,87,77,81,90
T1,87,90,100,100,83


In [51]:
# transpose: rows become columns and columns become rows
grades_t = grades.T
grades_t

Test,T1,T2,T3
Wally,87,96,70
Eva,90,87,90
Sam,100,77,90
Katie,100,81,82
Bob,83,90,85


In [54]:
# compute
# col by col - default
grades_t.mean()

Test
T1    92.0
T2    86.2
T3    83.4
dtype: float64

In [55]:
# Only the last expression in a cell is displayed, so the axis=0 result
# on the next line is computed and then discarded.
grades_t.mean(axis=0) # col by col
grades_t.mean(axis=1) # row by row

Wally    84.333333
Eva      89.000000
Sam      89.000000
Katie    87.666667
Bob      86.000000
dtype: float64

In [56]:
# adding new col 
grades_t['AVG'] = grades_t.mean(axis=1)
grades_t

Test,T1,T2,T3,AVG
Wally,87,96,70,84.333333
Eva,90,87,90,89.0
Sam,100,77,90,89.0
Katie,100,81,82,87.666667
Bob,83,90,85,86.0


In [57]:
grades

Unnamed: 0_level_0,Wally,Eva,Sam,Katie,Bob
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
T1,87,90,100,100,83
T2,96,87,77,81,90
T3,70,90,90,82,85


In [58]:
grades['sk_avg'] = (grades.Sam + grades.Katie)/2
grades

Unnamed: 0_level_0,Wally,Eva,Sam,Katie,Bob,sk_avg
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T1,87,90,100,100,83,100.0
T2,96,87,77,81,90,79.0
T3,70,90,90,82,85,86.0


In [60]:
# extracting data based on conditions 
grades_t[grades_t.AVG >= 87]


Test,T1,T2,T3,AVG
Eva,90,87,90,89.0
Sam,100,77,90,89.0
Katie,100,81,82,87.666667


In [62]:
# sort rows in ascending order by the given column
# returns a sorted copy; the original DataFrame is not changed
grades.sort_values(by='Sam')

Unnamed: 0_level_0,Wally,Eva,Sam,Katie,Bob,sk_avg
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T2,96,87,77,81,90,79.0
T3,70,90,90,82,85,86.0
T1,87,90,100,100,83,100.0


In [63]:
grades.sort_values(by='Sam', ascending=False)

Unnamed: 0_level_0,Wally,Eva,Sam,Katie,Bob,sk_avg
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T1,87,90,100,100,83,100.0
T3,70,90,90,82,85,86.0
T2,96,87,77,81,90,79.0


In [64]:
# Keep the sorted order by rebinding the name to the sorted result
# (preferred over inplace=True, which prevents method chaining).
grades = grades.sort_values(by='Sam', ascending=False)

In [65]:
grades

Unnamed: 0_level_0,Wally,Eva,Sam,Katie,Bob,sk_avg
Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T1,87,90,100,100,83,100.0
T3,70,90,90,82,85,86.0
T2,96,87,77,81,90,79.0


In [66]:
# applying function where cols are the parameters
df = pd.DataFrame({'name':['a','b','c'], 'x':[10,15,29]})
df

Unnamed: 0,name,x
0,a,10
1,b,15
2,c,29


In [67]:
# a new column must be created with bracket syntax; df.y = ... would only set an attribute
df['y'] = df.x * 2
df

Unnamed: 0,name,x,y
0,a,10,20
1,b,15,30
2,c,29,58


In [68]:
def f(x, y):
    """Return x plus the square of y.

    Uses only `+` and `**`, so it works on plain numbers and, as the
    notebook shows, element-wise on pandas Series.
    """
    y_squared = y ** 2
    return x + y_squared

In [70]:
# giving 2 col as parameters
df['fxy']=f(df.x, df.y)
df

Unnamed: 0,name,x,y,fxy
0,a,10,20,410
1,b,15,30,915
2,c,29,58,3393


In [72]:
x_series = df.x
y_series = df.y

In [73]:
x_series

0    10
1    15
2    29
Name: x, dtype: int64

In [74]:
y_series **2

0     400
1     900
2    3364
Name: y, dtype: int64

In [75]:
import math
def mytrig(x):
    """Return cos(x) + sin(x**2) for a single number x.

    Built on the `math` module, which works on one number at a time.
    """
    cosine_term = math.cos(x)
    sine_term = math.sin(x ** 2)
    return cosine_term + sine_term

In [78]:
mytrig(2)

-1.1729493318550706

In [77]:
# math.cos only accepts a single number, so passing a whole Series fails.
# The TypeError is the point of this cell: catch it and print it so the
# notebook still runs top to bottom.
try:
    math.cos(y_series)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: cannot convert the series to <class 'float'>

In [79]:
# Apply the scalar function mytrig to each value of column x, one element
# at a time. (This is element-wise application, not true vectorization.)
# Only column x is needed, so map over that column directly rather than
# building a full row object per row with df.apply(..., axis=1).
df['mytrig'] = df['x'].map(mytrig)

In [80]:
df

Unnamed: 0,name,x,y,fxy,mytrig
0,a,10,20,410,-1.345437
1,b,15,30,915,-1.689783
2,c,29,58,3393,-1.559626
