# Section 2: Chargement et manipulation de données

In [None]:
import pandas as pd #big_kangourou

# Load the persons CSV straight from the GitHub repository and preview the first rows.
# Alternative reading from a local copy of the file:
# df = pd.read_csv('./datasets/persons.csv')
df = pd.read_csv('https://raw.githubusercontent.com/tintamarre/python-introduction/main/datasets/persons.csv')
df.head()

In [None]:
# Load persons.xlsx from the GitHub repository; this rebinds df, replacing the frame read from the CSV.
df = pd.read_excel('https://raw.githubusercontent.com/tintamarre/python-introduction/main/datasets/persons.xlsx')
# Alternative reading from a local Parquet file:
# df = pd.read_parquet('./datasets/persons.parquet')
df

In [None]:
# Print the (number of rows, number of columns) tuple of the DataFrame
print(df.shape)

In [None]:
# List the column labels
df.columns

In [None]:
# Descriptive statistics of the DataFrame
df.describe()

In [None]:
# Concise summary: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Re-read the CSV, forcing the listed columns to be parsed as strings
df = pd.read_csv(
    'https://raw.githubusercontent.com/tintamarre/python-introduction/main/datasets/persons.csv',
    dtype=dict.fromkeys(['id', 'name', 'age', 'city'], str),
)

## Modification de données

In [None]:
# The birthdate is taken from the first 8 characters of national_number
df['birthdate'] = df['national_number'].str.slice(stop=8)
df.head()

In [None]:
# Parse the birthdate strings (two-digit year, month and day separated by dots) into datetimes
df['birthdate'] = df['birthdate'].pipe(pd.to_datetime, format='%y.%m.%d')
df.sample(5)

In [None]:
# A two-digit year can be parsed into the wrong century: any birthdate that
# lands in the future is moved back by 100 years
df['birthdate'] = df['birthdate'].mask(
    df['birthdate'] > pd.to_datetime('today'),
    df['birthdate'] - pd.DateOffset(years=100),
)
df.sample(5)

In [None]:
# Check the column dtypes after the birthdate conversion
df.dtypes

In [None]:
# Compute the age of each person in completed years.
# Casting a timedelta to '<m8[Y]' is rejected by pandas >= 2.0 (only the
# s/ms/us/ns resolutions are supported), and a year is not a fixed duration
# anyway. The age is therefore derived from calendar fields: the difference in
# years, minus one when this year's birthday has not happened yet.
today = pd.Timestamp.today()
birthday_pending = (
    (df['birthdate'].dt.month > today.month)
    | ((df['birthdate'].dt.month == today.month) & (df['birthdate'].dt.day > today.day))
)
df['age'] = today.year - df['birthdate'].dt.year - birthday_pending.astype(int)
df.head()

In [None]:
# Check the dtype of the new age column
df.dtypes

In [None]:
# Store the age as integers
df['age'] = df['age'].astype(int)
df.head()

In [None]:
# Confirm the dtype of age after the integer cast
df.dtypes

In [None]:
# Gender is encoded in the national number: the 4th character from the end
# is even for a female, otherwise it should be a male
df['gender_digit'] = df['national_number'].str.get(-4).astype(int)
df.head()

In [None]:
def get_gender(fourth_digit_from_end):
    """Return 'Female' when the given digit is even, 'Male' otherwise."""
    if fourth_digit_from_end % 2:
        return 'Male'
    return 'Female'

# Label each person from the parity digit: even gives 'Female', odd gives 'Male'
df['gender'] = df['gender_digit'].map(get_gender)
df.head()

In [None]:
# Add an age category column.
# Bin edges are right-inclusive: (0, 38], (38, 42], (42, 46], (46, 113].
# include_lowest=True closes the first bin on the left so that an age of 0 is
# labelled '0-38 Jeune' instead of being left as NaN.
df['age_category'] = pd.cut(
    df['age'],
    bins=[0, 38, 42, 46, 113],
    labels=['0-38 Jeune', '39-42 Adulte', '43-46 À point', '47-113 Senior'],
    include_lowest=True,
)

df.head()

In [None]:
# Derive the company from the email domain: take the text between '@' and the
# first following '.', capitalize it, then turn hyphens into spaces
df['company'] = (
    df['email']
    .str.split('@').str.get(1)
    .str.split('.').str.get(0)
    .str.capitalize()
    .str.replace('-', ' ')
)

df.head()

In [None]:
# Drop the phone, gender_digit and email columns
df = df.drop(['phone', 'gender_digit', 'email'], axis='columns')

In [None]:
# Build full_name as "<first_name> <last_name>", then discard the two source columns
df['full_name'] = df['first_name'].add(' ').add(df['last_name'])
df.drop(['first_name', 'last_name'], axis='columns', inplace=True)
df.sample(5)

In [None]:
from pathlib import Path

# Save the cleaned dataset without the index column.
# The input was downloaded from a URL, so the local ./datasets folder may not
# exist yet; create it first, otherwise to_csv raises an OSError.
Path('./datasets').mkdir(parents=True, exist_ok=True)
df.to_csv('./datasets/persons_cleaned.csv', index=False)

# Visualisation des données

In [None]:
# Histogram of the ages, using 20 bins
df['age'].plot.hist(bins=20)

In [None]:
# Bar chart of the number of persons per age category, with the categories in index order
category_counts = df['age_category'].value_counts().sort_index()
category_counts.plot.bar(color='green')

In [None]:
# Install plotly; the leading "!" makes the notebook run this line as a shell command
!pip install plotly

In [None]:
import plotly.express as px

# Grouped bar chart of the number of persons per age category, split by gender.
# category_orders expects a plain list of category values. The categories of
# the categorical age_category column are already in ascending age order, so
# they are used directly instead of deriving the order from a value_counts() index.
fig = px.histogram(
    df,
    x='age_category',
    color='gender',
    title='Age category distribution',
    labels={'age_category': 'Age category', 'count': 'Number of persons'},
    barmode='group',
    category_orders={'age_category': list(df['age_category'].cat.categories)},
)
fig.show()