In [1]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [3]:
def load_dataset(file_path):
    """Read a CSV file into a DataFrame and print a preview of it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(df, file_path)``: the loaded DataFrame and the path it came from.

    Raises
    ------
    Exception
        If reading or previewing the file fails. The message carries the
        text of the underlying error, which is also attached as
        ``__cause__``.
    """
    try:
        df = pd.read_csv(file_path)
        print("‚úî Dataset charg√© avec succ√®s :", file_path)
        print(df.head())
        return df, file_path
    except Exception as e:
        # Chain explicitly so the root cause (missing file, parse error, ...)
        # stays reachable through __cause__ and is shown as the direct cause
        # in the traceback.
        raise Exception(f"Erreur lors du chargement du dataset: {e}") from e


In [4]:
def data_understanding(df):
    """Print a structural overview of ``df``.

    Four sections are written to stdout, in this order: the shape, the
    per-column dtypes, the per-column count of missing values, and the
    ``describe()`` summary. Nothing is returned.
    """
    sections = (
        ("\nüîπ DIMENSIONS :", lambda: df.shape),
        ("\nüîπ TYPES :", lambda: df.dtypes),
        ("\nüîπ VALEURS MANQUANTES :", lambda: df.isnull().sum()),
        ("\nüîπ STATISTIQUES :", lambda: df.describe()),
    )
    # Each section is computed only at the moment it is printed, so a failure
    # in a later section does not suppress the earlier ones.
    for heading, compute in sections:
        print(heading, compute())


In [5]:
def plot_correlations(df):
    """Draw the correlation heatmap of the numeric columns and save it.

    The pairwise correlations of the ``np.number`` columns of ``df`` are
    rendered as an annotated seaborn heatmap and written to
    ``correlation_matrix.png`` (relative to the current working directory).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; non-numeric columns are ignored.

    Returns
    -------
    None
    """
    # Correlate numeric columns only: since pandas 2.0, DataFrame.corr() no
    # longer drops non-numeric columns silently and raises on e.g. strings.
    # The selection mirrors the one used by the other steps of this file.
    numeric_df = df.select_dtypes(include=[np.number])

    plt.figure(figsize=(10,6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
    plt.title("Matrice des corr√©lations")
    plt.tight_layout()
    plt.savefig("correlation_matrix.png")
    print("‚úî Matrice de corr√©lation sauvegard√©e ‚Üí correlation_matrix.png")


In [13]:
def detect_outliers(df, iqr_factor=1.5):
    """Count IQR-rule outliers in every numeric column.

    A value is counted as an outlier when it is strictly below
    ``Q1 - iqr_factor * IQR`` or strictly above ``Q3 + iqr_factor * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; only ``np.number`` columns are inspected.
    iqr_factor : float, optional
        Width of the fences in IQR units (default 1.5).

    Returns
    -------
    dict
        Mapping of column name to its number of outliers. The same counts
        are also printed to stdout.
    """
    print("\nüîç D√©tection d'outliers (IQR) ...")
    outliers_count = {}

    for col in df.select_dtypes(include=[np.number]).columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        # Count directly on the boolean mask; there is no need to build the
        # filtered DataFrame just to take its length.
        is_outlier = (values < q1 - iqr_factor * iqr) | (values > q3 + iqr_factor * iqr)
        outliers_count[col] = int(is_outlier.sum())

    print("\nüìå Nombre d'outliers par feature :")
    for col, n in outliers_count.items():
        print(f"{col} : {n}")

    # Hand the counts back so callers can use them, not only read them.
    return outliers_count


In [14]:
def feature_engineering(df):
    """Return a copy of ``df`` with its numeric columns standardised.

    The ``np.number`` columns of the copy are replaced by the output of
    ``StandardScaler().fit_transform``; other columns are left as they are.
    The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.

    Returns
    -------
    pandas.DataFrame
        The transformed copy. When ``df`` has no numeric column or no row,
        the copy is returned unscaled.
    """
    df_copy = df.copy()
    num_cols = df_copy.select_dtypes(include=[np.number]).columns

    # StandardScaler rejects input with zero features or zero samples, so
    # there is nothing to scale here: return the untouched copy instead of
    # failing inside scikit-learn.
    if len(num_cols) == 0 or len(df_copy) == 0:
        print("Aucune donnee numerique a standardiser, dataset retourne tel quel")
        return df_copy

    scaler = StandardScaler()
    df_copy[num_cols] = scaler.fit_transform(df_copy[num_cols])

    print("‚úî Feature engineering appliqu√© (StandardScaler)")
    return df_copy


In [15]:
def pca_analysis(df):
    """Project the numeric columns on two principal components and plot them.

    A 2-component PCA is fitted on the ``np.number`` columns of ``df``. The
    projected points are drawn as a scatter plot saved to ``pca_plot.png``,
    and the explained-variance ratio of the two components is printed.
    Nothing is returned.
    """
    numeric_part = df.select_dtypes(include=[np.number])

    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(numeric_part)

    # First column of the projection goes on the x axis, second on the y axis.
    first_axis = projected[:, 0]
    second_axis = projected[:, 1]
    marker_style = {"s": 10, "alpha": 0.7}

    plt.figure(figsize=(7,5))
    plt.scatter(first_axis, second_axis, **marker_style)
    plt.title("Analyse PCA (2 composantes)")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.tight_layout()
    plt.savefig("pca_plot.png")
    print("‚úî PCA sauvegard√©e ‚Üí pca_plot.png")

    variance_share = reducer.explained_variance_ratio_
    print("\nüéØ Variance expliqu√©e :", variance_share)



In [16]:
def main(file_path="Concrete_Data.csv"):
    """Run the whole analysis pipeline on a CSV dataset.

    Steps, in order: load the file, print its overview, save the correlation
    heatmap, report IQR outliers, standardise the numeric columns, save the
    PCA plot, then export the standardised data next to the input as
    ``<name>_transformed<ext>``.

    Parameters
    ----------
    file_path : str, optional
        CSV file to analyse (default ``"Concrete_Data.csv"``).

    Returns
    -------
    None
        Any exception raised by a step is caught and reported on stdout
        rather than propagated.
    """
    try:
        df, file_path = load_dataset(file_path)

        data_understanding(df)
        plot_correlations(df)
        detect_outliers(df)
        df_engineered = feature_engineering(df)
        pca_analysis(df_engineered)

        # Split off the real extension instead of str.replace(".csv", ...):
        # replace() rewrites every ".csv" occurrence in the path and, when
        # the extension is absent, returns the input path itself, which
        # would overwrite the source dataset.
        root, ext = os.path.splitext(file_path)
        output_path = f"{root}_transformed{ext or '.csv'}"
        df_engineered.to_csv(output_path, index=False)
        print(f"‚úî Dataset transform√© sauvegard√© ‚Üí {output_path}")

        # Announce success only once every step, export included, is done.
        print("\n" + "="*60)
        print("üéâ ANALYSE TERMIN√âE AVEC SUCC√àS !")
        print("="*60)

    except Exception as e:
        print(f"‚ùå Une erreur est survenue: {str(e)}")


In [17]:
# Entry point: run the pipeline when this code executes as the top-level
# module (which is also the case when the cell is run in a notebook kernel).
if __name__ == "__main__":
    main()


‚úî Dataset charg√© avec succ√®s : Concrete_Data.csv
   Cement (component 1)(kg in a m^3 mixture)  \
0                                      540.0   
1                                      540.0   
2                                      332.5   
3                                      332.5   
4                                      198.6   

   Blast Furnace Slag (component 2)(kg in a m^3 mixture)  \
0                                                0.0       
1                                                0.0       
2                                              142.5       
3                                              142.5       
4                                              132.4       

   Fly Ash (component 3)(kg in a m^3 mixture)  \
0                                         0.0   
1                                         0.0   
2                                         0.0   
3                                         0.0   
4                                         0.0   

  