In [1]:
#load the library and check its version
import numpy as np
import pandas as pd
pd.__version__

'1.5.1'

<p><span style="font-size: 18pt;">Series vs. Dataframes</span></p>
<p><span style="font-size: 14pt;"><span style="color: #0a0a23; font-family: Lato, sans-serif;">&middot; Series: one-dimensional object that can hold any data type such as integers, floats and strings.</span></span></p>
<p><span style="font-size: 14pt;"><span style="color: #0a0a23; font-family: Lato, sans-serif;">&middot; </span><span style="color: #0a0a23; font-family: Lato, sans-serif;">DataFrame: two-dimensional object whose columns can each hold a different data type.</span></span></p>
<p>&nbsp;</p>
<p><span style="font-size: 14pt;"><span style="color: #0a0a23; font-family: Lato, sans-serif;">DataFrames are the most commonly used objects in pandas.</span></span></p>

In [2]:
#build a Series from a list; with no index given, pandas labels the items 0, 1, 2, ...
x = pd.Series(data=[6, 3, 4, 10])
x

0     6
1     3
2     4
3    10
dtype: int64

In [3]:
#a Series whose items carry custom index labels instead of the default 0..n-1
x = pd.Series(data=[6, 3, 4, 10], index=list("abcd"))
print(x)

#an item can then be looked up by its label
print(f"Access element with index c --> x['c']: {x['c']}")

a     6
b     3
c     4
d    10
dtype: int64
Access element with index c --> x['c']: 4


In [4]:
#create a data frame from a dictionary: each key becomes a column name and
#each list supplies that column's values, one entry per row
df = pd.DataFrame(
    {
        'Origin': ['Spain', 'Colombia', 'Chile', 'Equador', 'Spain', 'Colombia'],
        'Age': [23, 44, 25, 25, 54, 44],
        'Gender': ['male', 'female', 'female', 'male', 'male', 'female'],
    }
)
#a bare last expression gets the same rich rendering as display(df), matches the
#other cells, and does not rely on a name that only exists inside IPython
df

Unnamed: 0,Origin,Age,Gender
0,Spain,23,male
1,Colombia,44,female
2,Chile,25,female
3,Equador,25,male
4,Spain,54,male
5,Colombia,44,female


In [5]:
#look at the first n rows of the dataframe (here n = 3)
df.head(n=3)

Unnamed: 0,Origin,Age,Gender
0,Spain,23,male
1,Colombia,44,female
2,Chile,25,female


In [6]:
#look at the last n rows of the dataframe (here n = 2)
df.tail(n=2)

Unnamed: 0,Origin,Age,Gender
4,Spain,54,male
5,Colombia,44,female


In [7]:
#inspect n random rows; random_state pins the draw so that re-running the
#notebook shows the same rows every time
df.sample(n=4, random_state=42)

Unnamed: 0,Origin,Age,Gender
3,Equador,25,male
1,Colombia,44,female
4,Spain,54,male
0,Spain,23,male


<p><span style="font-size: 14pt;">Describe a DataFrame and Access Data</span></p>

In [8]:
#Display a summary of the frame: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Origin  6 non-null      object
 1   Age     6 non-null      int64 
 2   Gender  6 non-null      object
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes


In [9]:
#Statistical description (count, mean, std, min, quartiles, max); for this frame only the numeric column Age is summarised
df.describe()

Unnamed: 0,Age
count,6.0
mean,35.833333
std,13.136463
min,23.0
25%,25.0
50%,34.5
75%,44.0
max,54.0


In [10]:
#get the values of one row: .loc selects by index label and returns the row as a Series
#(bare last expression instead of display(), as in the other cells; the rendering is the same)
df.loc[2]

Origin     Chile
Age           25
Gender    female
Name: 2, dtype: object

In [11]:
#get several rows at once; a .loc slice works on labels and includes the end label (rows 2, 3 and 4)
df.loc[2:4, :]

Unnamed: 0,Origin,Age,Gender
2,Chile,25,female
3,Equador,25,male
4,Spain,54,male


In [12]:
#get a single column by its name; the result is a Series
df["Origin"]  #same as df.Origin

0       Spain
1    Colombia
2       Chile
3     Equador
4       Spain
5    Colombia
Name: Origin, dtype: object

In [13]:
#get several columns by passing a list of names; the result is a DataFrame
df[["Origin", "Age"]]

Unnamed: 0,Origin,Age
0,Spain,23
1,Colombia,44
2,Chile,25
3,Equador,25
4,Spain,54
5,Colombia,44


In [14]:
#get a subset of rows and columns in one step: row labels first, column names second
df.loc[2:3, ["Origin", "Age"]]

Unnamed: 0,Origin,Age
2,Chile,25
3,Equador,25


In [15]:
#select rows by column values: the comparison builds a boolean mask and only rows where it is True are kept
df[df["Age"] > 30]

Unnamed: 0,Origin,Age,Gender
1,Colombia,44,female
4,Spain,54,male
5,Colombia,44,female


<p><span style="font-size: 14pt;">Basic Operations on DataFrames</span></p>

In [16]:
#show the whole frame again as a reference for the operations below
df

Unnamed: 0,Origin,Age,Gender
0,Spain,23,male
1,Colombia,44,female
2,Chile,25,female
3,Equador,25,male
4,Spain,54,male
5,Colombia,44,female


In [17]:
#count how many times each distinct value occurs in a column
df["Origin"].value_counts()

Spain       2
Colombia    2
Chile       1
Equador     1
Name: Origin, dtype: int64

In [18]:
#sort the rows by the 'Age' column, smallest first; reset_index() then renumbers
#the rows from 0 and keeps the previous labels in a new 'index' column
df2 = (
    df.sort_values(by="Age", ascending=True)
    .reset_index()
)
df2

Unnamed: 0,index,Origin,Age,Gender
0,0,Spain,23,male
1,2,Chile,25,female
2,3,Equador,25,male
3,1,Colombia,44,female
4,5,Colombia,44,female
5,4,Spain,54,male


In [19]:
#remove duplicated rows, order by Age (ascending) and then Gender (descending),
#and finally renumber the rows
#inplace=False --> returns a new, sorted DataFrame (a copy); inplace=True --> modifies
#the dataframe itself and returns None, so it could not be chained like this
(
    df.drop_duplicates()
    .sort_values(by=["Age", "Gender"], ascending=[True, False], inplace=False)
    .reset_index()
)

Unnamed: 0,index,Origin,Age,Gender
0,0,Spain,23,male
1,3,Equador,25,male
2,2,Chile,25,female
3,1,Colombia,44,female
4,4,Spain,54,male


In [20]:
#add columns to a copy, so that the original df stays unchanged
#(numpy is imported as np in the import cell at the top of the notebook)
df2 = df.copy()

#a column built from an array holding one value per row
df2['Height'] = np.array([1.71, 1.65, 1.70, 1.69, 1.75, 1.58])

#a column where every row gets the same default value
df2['Species'] = 'Human'
df2

Unnamed: 0,Origin,Age,Gender,Height,Species
0,Spain,23,male,1.71,Human
1,Colombia,44,female,1.65,Human
2,Chile,25,female,1.7,Human
3,Equador,25,male,1.69,Human
4,Spain,54,male,1.75,Human
5,Colombia,44,female,1.58,Human


In [21]:
#remove one column; drop() returns a new frame, which is bound back to df2
#errors='ignore' keeps the cell safe to re-run once 'Species' is already gone
#(the in-place version raised a KeyError on the second run)
df2 = df2.drop(columns='Species', errors='ignore')
df2

Unnamed: 0,Origin,Age,Gender,Height
0,Spain,23,male,1.71
1,Colombia,44,female,1.65
2,Chile,25,female,1.7
3,Equador,25,male,1.69
4,Spain,54,male,1.75
5,Colombia,44,female,1.58


<p><span style="font-size: 14pt;">More Operations</span></p>

In [22]:
#a small single-column frame of daily rain amounts; mixing ints and floats gives a float column
data = pd.DataFrame(data={"daily_rain": [10, 0, 12.4, 6.4, 0, 0, 1.1]})
data

Unnamed: 0,daily_rain
0,10.0
1,0.0
2,12.4
3,6.4
4,0.0
5,0.0
6,1.1


In [23]:
#create a new column holding the accumulated (running total) rain values
data["accummulated_rain"] = data["daily_rain"].cumsum()
data

Unnamed: 0,daily_rain,accummulated_rain
0,10.0,10.0
1,0.0,10.0
2,12.4,22.4
3,6.4,28.8
4,0.0,28.8
5,0.0,28.8
6,1.1,29.9


In [24]:
#operations between columns (+, -, *, /...) are applied element-wise, row by row
data["daily_rain"] - data["accummulated_rain"]  #same as data.daily_rain - data.accummulated_rain

0     0.0
1   -10.0
2   -10.0
3   -22.4
4   -28.8
5   -28.8
6   -28.8
dtype: float64

In [25]:
#numpy functions can also be applied to a column, element by element
np.sqrt(data["accummulated_rain"])

0    3.162278
1    3.162278
2    4.732864
3    5.366563
4    5.366563
5    5.366563
6    5.468089
Name: accummulated_rain, dtype: float64

<p><span style="font-size: 14pt;">Load data from other sources</span></p>
<p><span style="font-size: 12pt;">In pandas we can load data from many sources; the most typical are databases and text files. Let's explore the latter.</span></p>

In [26]:
#load the data set from a CSV file (the path is relative to the working directory)
#note: this rebinds `data`, which until now held the rain measurements
data = pd.read_csv('./Data/data_breast_cancer.csv')
print(f"tamaño de datos: {data.shape}\n")
#info() writes its report to stdout itself and returns None, so it must not be
#wrapped in print() -- that would append a stray "None" line to the output
data.info()
#random_state pins the sample so the preview shows the same rows on every run
data.sample(6, random_state=42)


tamaño de datos: (569, 33)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
287,8913,B,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,...,15.54,87.4,577.0,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915,
37,854941,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,...,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169,
443,909777,B,10.57,18.32,66.82,340.9,0.08142,0.04462,0.01993,0.01111,...,23.31,69.35,366.3,0.09794,0.06542,0.03986,0.02222,0.2699,0.06736,
33,854002,M,19.27,26.47,127.9,1162.0,0.09401,0.1719,0.1657,0.07593,...,30.9,161.4,1813.0,0.1509,0.659,0.6091,0.1785,0.3672,0.1123,
323,895100,M,20.34,21.51,135.9,1264.0,0.117,0.1875,0.2565,0.1504,...,31.86,171.1,1938.0,0.1592,0.4492,0.5344,0.2685,0.5558,0.1024,
519,917080,B,12.75,16.7,82.51,493.8,0.1125,0.1117,0.0388,0.02995,...,21.74,93.63,624.1,0.1475,0.1979,0.1423,0.08045,0.3071,0.08557,
