# Chapter 3: Managing and preparing data



Let us begin with simple computations in Python using +,-,* and / operators and functions.

In [1]:
import numpy as np
import pandas as pd

weight = np.array([60,72,57,90,95,72])
height = np.array([1.75,1.8,1.65,1.9,1.74,1.91])
gender = np.array(["m","f","m","f","f","m"])
ghw  = pd.DataFrame({"gender":gender,"height":height,"weight":weight})
ghw

Unnamed: 0,gender,height,weight
0,m,1.75,60
1,f,1.8,72
2,m,1.65,57
3,f,1.9,90
4,f,1.74,95
5,m,1.91,72


In [12]:
ghw.iloc[0,0]

'm'

Commonly used functions like sqrt(),exp(), and log() are made available in the `math` package.

In [13]:
print(ghw.iloc[:,1:3])

   height  weight
0    1.75      60
1    1.80      72
2    1.65      57
3    1.90      90
4    1.74      95
5    1.91      72


In [14]:
ghw.iloc[1:5,:]

Unnamed: 0,gender,height,weight
1,f,1.8,72
2,m,1.65,57
3,f,1.9,90
4,f,1.74,95


In [15]:
ghw.iloc[:,0]

0    m
1    f
2    m
3    f
4    f
5    m
Name: gender, dtype: object

In [16]:
ghw.iloc[[0,2,5],1:3]

Unnamed: 0,height,weight
0,1.75,60
2,1.65,57
5,1.91,72


In [17]:
x = ghw.iloc[0:3,:]
x.shape

(3, 3)

In [18]:
0/0

ZeroDivisionError: division by zero

In [19]:
y = 0
0 if y == 0 or np.isnan(y) else x/y 
#0 if y == 0  else x/y

0

In [20]:
df = pd.DataFrame({'a': [0,-1], 'b': [2,3]})
print(df)
df /= 0
print(df)

   a  b
0  0  2
1 -1  3
     a    b
0  NaN  inf
1 -inf  inf


In [21]:
df['a'].isna()

0     True
1    False
Name: a, dtype: bool

In [22]:
df = pd.DataFrame({'a': [1,2], 'b': [3,4]})
display(df)
df2 = df.reindex(['x'])
df2

Unnamed: 0,a,b
0,1,3
1,2,4


Unnamed: 0,a,b
x,,


In [23]:
x = True
not not 2<=2

True

## Specifiying Conditions

In [24]:
student = pd.read_csv("../data/student.csv")
student.head()

Unnamed: 0,id,gender,math,prog,daysabs
0,1001,0,63,2,4
1,1002,0,27,2,4
2,1003,1,20,2,2
3,1004,1,16,2,3
4,1005,1,2,2,3


In [25]:
x1 = student[student['daysabs']==0]
print(x1.head())

      id  gender  math  prog  daysabs
15  1016       0    89     2        0
17  1018       1    35     2        0
21  1022       0    61     2        0
23  1024       0    63     2        0
27  1028       1    21     2        0


In [26]:
x2 = student[(student['daysabs']==0) & (student['math']>60)]
print(x2.head())

      id  gender  math  prog  daysabs
15  1016       0    89     2        0
21  1022       0    61     2        0
23  1024       0    63     2        0
34  1035       0    68     2        0
70  1071       0    72     2        0


In [27]:
f1 = (student['gender']==0) & (student['prog']==3) & (student['math']>60)
f1

0      False
1      False
2      False
3      False
4      False
       ...  
309    False
310    False
311    False
312    False
313    False
Length: 314, dtype: bool

In [28]:
x3 = student[f1]
print(x3.head())

       id  gender  math  prog  daysabs
88   1089       0    84     3        4
164  2007       0    71     3        0
166  2009       0    71     3        0
168  2011       0    77     3        2
172  2015       0    65     3        1


## Factor Variables

In [29]:
arthritis = pd.read_csv("../data/Arthritis.csv")
arthritis.head()

Unnamed: 0.1,Unnamed: 0,ID,Treatment,Sex,Age,Improved
0,1,57,Treated,Male,27,Some
1,2,46,Treated,Male,29,
2,3,77,Treated,Male,30,
3,4,17,Treated,Male,32,Marked
4,5,36,Treated,Male,46,Marked


In [30]:
arthritis['Sex'].value_counts()

Female    59
Male      25
Name: Sex, dtype: int64

In [44]:
arthritis[['Treatment','Improved']].value_counts().unstack()

Improved,Marked,None,Some
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Placebo,7,29,7
Treated,21,13,7


In [32]:
arthritis[['Treatment','Improved']].value_counts(normalize=True).unstack()

Improved,Marked,None,Some
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Placebo,0.083333,0.345238,0.083333
Treated,0.25,0.154762,0.083333


In [46]:
arthritis.groupby('Improved')['Treatment'].value_counts(normalize=True).unstack()

Treatment,Placebo,Treated
Improved,Unnamed: 1_level_1,Unnamed: 2_level_1
Marked,0.25,0.75
,0.690476,0.309524
Some,0.5,0.5


In [47]:
arthritis.groupby('Treatment')['Improved'].value_counts(normalize=True).unstack()

Improved,Marked,None,Some
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Placebo,0.162791,0.674419,0.162791
Treated,0.512195,0.317073,0.170732


In [49]:
df3 = arthritis[['Sex','Treatment','Improved']].value_counts().unstack()[['None','Some','Marked']]
df3

Unnamed: 0_level_0,Improved,None,Some,Marked
Sex,Treatment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,Placebo,19.0,7.0,6.0
Female,Treated,6.0,5.0,16.0
Male,Placebo,10.0,,1.0
Male,Treated,7.0,2.0,5.0


In [37]:
#df4 = df3.reset_index(level='Improved')
#display(df4)
df4 = df3.unstack()
df4

Improved,Marked,Marked,None,None,Some,Some
Treatment,Placebo,Treated,Placebo,Treated,Placebo,Treated
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,6.0,16.0,19.0,6.0,7.0,5.0
Male,1.0,5.0,10.0,7.0,,2.0


## Numeric Variables

In [110]:
whiteside = pd.read_csv('../data/whiteside.csv',index_col=0)
whiteside.head()

Unnamed: 0_level_0,Temp,Gas
Insul,Unnamed: 1_level_1,Unnamed: 2_level_1
Before,-0.8,7.2
Before,-0.7,6.9
Before,0.4,6.4
Before,2.5,6.0
Before,2.9,5.8


In [14]:
whiteside.groupby(['Insul'])['Temp'].agg(np.mean)

Insul
After     4.463333
Before    5.350000
Name: Temp, dtype: float64

In [23]:
whiteside.groupby(['Insul'])['Temp'].mean()

Insul
After     4.463333
Before    5.350000
Name: Temp, dtype: float64