In [6]:
import numpy as np
import pandas as pd

# Load the data.
# Every field is read as text (dtype=str) with NA detection switched off, so
# non-numeric placeholders stay literal strings at this stage; the later
# pd.to_numeric(..., errors='coerce') calls are what turn them into NaN.
# A single pd.read_csv call replaces np.genfromtxt(dtype=object) followed by a
# cell-by-cell bytes decode through the deprecated DataFrame.applymap.
df = pd.read_csv(
    "household_power_consumption.txt",
    sep=";",
    dtype=str,
    na_filter=False,
)

# Drop the columns that are not needed
df = df.drop(['Time', 'Sub_metering_2', 'Sub_metering_3'], axis=1)

# Use 'Date' as the index
df = df.set_index('Date')

# Convert the index and the measurement columns to their proper dtypes
def update_types(df):
    """Return a typed copy of ``df``: datetime index, numeric measurement columns.

    The input frame is not modified; a new frame is built and returned, so the
    function can safely be called again on the same raw input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds dates written as day/month/year
        (``'%d/%m/%Y'``) and which contains the columns
        ``Global_active_power``, ``Global_reactive_power``, ``Voltage``,
        ``Global_intensity`` and ``Sub_metering_1``.

    Returns
    -------
    pandas.DataFrame
        Same columns in the same order. The five columns above are converted
        with ``pd.to_numeric(errors='coerce')``, so values that cannot be
        parsed become NaN; ``Sub_metering_1`` is additionally downcast to the
        smallest float dtype pandas can use. The index is converted with
        ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If an index value does not match the ``'%d/%m/%Y'`` format.
    """
    plain_numeric_columns = (
        'Global_active_power',
        'Global_reactive_power',
        'Voltage',
        'Global_intensity',
    )

    converted = {
        name: pd.to_numeric(df[name], errors='coerce')
        for name in plain_numeric_columns
    }
    # Only this column is downcast, which keeps it in a smaller float dtype.
    converted['Sub_metering_1'] = pd.to_numeric(
        df['Sub_metering_1'], errors='coerce', downcast='float'
    )

    # assign() works on a copy, so the caller's frame keeps its raw values.
    typed = df.assign(**converted)
    typed.index = pd.to_datetime(df.index, format='%d/%m/%Y')
    return typed

# Apply the dtype conversions to the loaded frame
df = update_types(df)

# Show the resulting column dtypes
print(df.dtypes)

# Show summary statistics (computed before rows with missing values are removed)
print(df.describe())

# Discard every row containing a missing value, then confirm none remain
df = df.dropna()
print(df.isna().sum())

# Rescale Sub_metering_1: add 1, then multiply by 0.06
rescaled_sub_metering = (df['Sub_metering_1'] + 1) * 0.06
df.loc[:, 'Sub_metering_1'] = rescaled_sub_metering

# Keep rows dated 2008-12-27 or later whose Voltage is at least 242
on_or_after_cutoff = df.index >= '2008-12-27'
high_voltage = df['Voltage'] >= 242
filtered_df = df[on_or_after_cutoff & high_voltage]
print(filtered_df)

# Show the row at position 88888 of the cleaned frame
# NOTE(review): iloc is zero-based, so this is the 88889th row; confirm
# whether position 88887 (the 88888th row) was intended.
print(df.iloc[88888])

# Find the date(s) on which Global_active_power reaches its maximum
max_power = df['Global_active_power'].max()
is_peak = df['Global_active_power'] == max_power
max_power_date = df[is_peak].index
print(max_power_date)

# Sort the first three columns: Global_active_power descending,
# ties broken by Voltage ascending; show the last rows of that ordering
df3 = df.columns[:3]
sorted_df = df[df3].sort_values(
    by=['Global_active_power', 'Voltage'],
    ascending=[False, True],
)
print(sorted_df.tail())

# Mean Global_active_power per calendar day
power_by_day = df.groupby(df.index.date)['Global_active_power']
daily_avg = power_by_day.mean()
print(daily_avg)

  df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)


Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float32
dtype: object
       Global_active_power  Global_reactive_power       Voltage  \
count         2.049280e+06           2.049280e+06  2.049280e+06   
mean          1.091615e+00           1.237145e-01  2.408399e+02   
std           1.057294e+00           1.127220e-01  3.239987e+00   
min           7.600000e-02           0.000000e+00  2.232000e+02   
25%           3.080000e-01           4.800000e-02  2.389900e+02   
50%           6.020000e-01           1.000000e-01  2.410100e+02   
75%           1.528000e+00           1.940000e-01  2.428900e+02   
max           1.112200e+01           1.390000e+00  2.541500e+02   

       Global_intensity  Sub_metering_1  
count      2.049280e+06    2.049280e+06  
mean       4.627759e+00    1.121923e+00  
std        4.444396e+00    6.153031e+00  
min        2.000000e-01    0.000000e+00  
25%    