# 🏋 ex3 Python basics


## Getting help

You can access the help by executing the cell but getting help in a separate Python Console is less distracting.

In [1]:
help('list')  #Help on class list in module builtins
?list  #prints code documentation

Help on class list in module builtins:

class list(object)
 |  list(iterable=(), /)
 |  
 |  Built-in mutable sequence.
 |  
 |  If no argument is given, the constructor creates a new empty list.
 |  The argument must be an iterable if specified.
 |  
 |  Methods defined here:
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __contains__(self, key, /)
 |      Return key in self.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __iadd__(self, value, /)
 |      Implement self+=value.
 |  
 |  __imul__(self, value, /)
 |      Implement self*=value.
 |  
 |  __init__(self, /, *args, **kwargs)
 |      Initialize self.  See help(type(self))

## Python as a calculator

In [1]:
2 * 15 + 11

41

## Basic Types data structures

In [3]:
s1 = 'Hello'  #a string

s2 = " Python!"  #another string 

s3 = ''' Python
is fun'''  #a multiline string

a = 1  #int

b = 2.0  #float

c = True  #boolean true

d = False  #boolean false

print(s1 + s2 + s3)

print('a=' + str(a))

print('b is equal to', b, 'c and d are booleans equal to', c, 'and', d)

mylist = [1, 2.0, "3", '3', True, False]  #a list

print('mylist is', mylist)

mylist[0]  #indices start at 0!

#a dictionary map
mymap = {'setosa': '#a6cee3', 'versicolor': '#1f78b4', 'virginica': '#b2df8a'}

print('mymap is', mymap)

Hello Python! Python
is fun
a=1
b is equal to 2.0 c and d are booleans equal to True and False
mylist is [1, 2.0, '3', '3', True, False]
mymap is {'setosa': '#a6cee3', 'versicolor': '#1f78b4', 'virginica': '#b2df8a'}


## Functions

In [4]:
def f(x):  #declare f
    return x * x

f(2)  #call f with argument 2

4

## Packages

[NumPy](http://www.numpy.org) (N-dimensional array objects) and [Pandas](http://pandas.pydata.org) (dataframes built with NumPy) are the most relevant for data visualizations.

In [3]:
import numpy as np  #import numpy
import pandas as pd  #import pandas

Create a numpy array

In [4]:
arr = np.arange(6)  #?np.arange: Return evenly spaced values within a given interval.
np.random.shuffle(arr)
display(arr.size)
arr

6

array([0, 2, 3, 1, 5, 4])

Create a panda dataframe from numpy array

In [5]:
df = pd.DataFrame(arr)  #create Pandas dataframe from arr data
df

Unnamed: 0,0
0,0
1,2
2,3
3,1
4,5
5,4


Create a pandas dataframe with typed and named columns. Note that `pd.Categorical` is the equivalent of R `factors`.

In [6]:
df = pd.DataFrame({
    'A' : pd.Series([1, 2, 3, 4, 5, 6]),
    'B' : pd.Timestamp('20201001'),
    'C' : pd.Categorical(['male', 'female', 'female', 'female', 'male', 'male']),
    'D' : 'foo'})

display(df.C)

display(df.head())  #display first 5 rows
display(df.tail())  #last 5 rows

df.columns  #list columns names

0      male
1    female
2    female
3    female
4      male
5      male
Name: C, dtype: category
Categories (2, object): ['female', 'male']

Unnamed: 0,A,B,C,D
0,1,2020-10-01,male,foo
1,2,2020-10-01,female,foo
2,3,2020-10-01,female,foo
3,4,2020-10-01,female,foo
4,5,2020-10-01,male,foo


Unnamed: 0,A,B,C,D
1,2,2020-10-01,female,foo
2,3,2020-10-01,female,foo
3,4,2020-10-01,female,foo
4,5,2020-10-01,male,foo
5,6,2020-10-01,male,foo


Index(['A', 'B', 'C', 'D'], dtype='object')

Accessing dataframe elements

In [7]:
display(df.A)  #this is the preferred way to access column A
display(df.C[1])  #access by df[column = 'C'][row = 1]

0    1
1    2
2    3
3    4
4    5
5    6
Name: A, dtype: int64

'female'

Alternate way to access dataframe elements (sometimes needed, e.g., when creating a new column)

In [8]:
display(df['A'])  #access df[column = 'A']
display(df['C'][1])  #access df[column = 'C'][row = 1]

0    1
1    2
2    3
3    4
4    5
5    6
Name: A, dtype: int64

'female'

# Working with Data

## Load a dataset from a package

In [12]:
from sklearn import datasets
iris = datasets.load_iris()  #see the help: help(datasets)

df = pd.DataFrame(iris.data)  #create dataframe from iris.data
df.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Load a dataset in CSV format

In [13]:
import pandas as pd  #import pandas

df = pd.read_csv("data/heart-decease-cleveland.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


## List files in your drive:

In [None]:
!ls "./"

# Basic stats

## Descriptive statistics

In [14]:
df = pd.read_csv("data/heart-decease-cleveland.csv")
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


## Frequency table

In [33]:
arr = np.random.uniform(0, 100, 100)

df = pd.DataFrame({
    'Variable' : pd.Series(arr)
})

df['bin'] = pd.cut(df.Variable, [0, 20, 40, 60, 80, 100])
df = pd.value_counts(df.bin)  #count values for df.bin
df = df.to_frame('count').reset_index()
df = df.sort_values('index')
df['rf'] = df['count'] / len(df)

df['cf'] = df['rf'].cumsum()

df.columns = ['Value', 'No.', 'Rel. Freq.', 'Cum. Freq.']

df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,Value,No.,Rel. Freq.,Cum. Freq.
0,"(0, 20]",13,2.6,2.6
1,"(20, 40]",25,5.0,7.6
2,"(40, 60]",20,4.0,11.6
3,"(60, 80]",27,5.4,17.0
4,"(80, 100]",15,3.0,20.0


---

# Exercises

## 😜 Exercise 1 

- Create a dataframe for the values `0, 1, 1, 2, 2, 3, 4, 15`
- use `df.describe()` to compute descriptive statistics

In [16]:
import numpy as np
import pandas as pd

df = pd.DataFrame(pd.Series([0, 1, 1, 2, 2, 3, 4, 15]))  #create dataframe from iris.data
df.describe()

Unnamed: 0,0
count,8.0
mean,3.5
std,4.810702
min,0.0
25%,1.0
50%,2.0
75%,3.25
max,15.0


## 😜 Exercise 2 

- Load `heart-decease-cleveland.csv` in a dataframe
- use `df.describe()` to compute descriptive statistics of all the variables

In [17]:
import numpy as np
import pandas as pd

df = pd.read_csv("data/heart-decease-cleveland.csv")
df.describe()



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


## 🤔 Exercise 3

- Load `heart-decease-cleveland.csv` in a dataframe
- Create a frequency table of the `chol` variable for the frequency ranges:

```
(120, 160]
(160, 200]
(200, 240]
(240, 280]
(280, 320]
(320, 360]
(360, 400]
(400, 440]
```
- set `columns` to `'Chol mg/cl', 'No.', 'Rel. Freq.', 'Cum. Freq.'`

In [35]:
import numpy as np
import pandas as pd

df = pd.read_csv("data/heart-decease-cleveland.csv")

df['bin'] = pd.cut(df.chol, [120, 160, 200, 240, 280, 320, 360, 400, 440])
df = pd.value_counts(df.bin)  #count values for df.bin
df = df.to_frame('count').reset_index()
df = df.sort_values('index')
df['rf'] = df['count'] / len(df)

df['cf'] = df['rf'].cumsum()

df.columns = ['Chol mg/cl', 'No.', 'Rel. Freq.', 'Cum. Freq.']

df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,Chol mg/cl,No.,Rel. Freq.,Cum. Freq.
0,"(120, 160]",7,0.875,0.875
1,"(160, 200]",43,5.375,6.25
2,"(200, 240]",101,12.625,18.875
3,"(240, 280]",81,10.125,29.0
4,"(280, 320]",50,6.25,35.25
5,"(320, 360]",16,2.0,37.25
6,"(360, 400]",1,0.125,37.375
7,"(400, 440]",3,0.375,37.75
