In [1]:
import pandas as pd
import sys

In [2]:
# Load the dataset
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns on wide CSVs.
try:
    df = pd.read_csv('Morocco_Student_Data_Pool.csv', low_memory=False)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Re-raise instead of swallowing: if the load fails, `df` is never bound
    # and every later cell would die with an unrelated NameError that hides
    # the real cause.
    raise
else:
    # Only reached when read_csv returned without raising.
    print("Dataset loaded successfully.")


Dataset loaded successfully.


In [3]:
# Basic Information
# Report the frame's dimensions followed by pandas' per-frame summary
# (index range, column count, dtype breakdown, memory usage).
print("\n--- Basic Information ---")
print(f"Shape: {df.shape}")
print("\n--- Column Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appended a stray "None" line
# after the report.
df.info()


--- Basic Information ---
Shape: (10000, 286)

--- Column Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 286 entries, id_etudiant to remarques
dtypes: float64(65), int64(48), object(173)
memory usage: 21.8+ MB
None


In [4]:
# Missing Values
# Tally the null cells in each column first, then list only the columns
# whose tally is non-zero so fully populated columns do not clutter the report.
missing_values = df.isnull().sum()
print("\n--- Missing Values ---")
print(missing_values.loc[missing_values > 0])


--- Missing Values ---
etablissement_precedent     5925
type_maladie                2204
type_handicap              10000
economie_s1                10000
economie_s2                10000
economie_annuel            10000
comptabilite_s1            10000
comptabilite_s2            10000
comptabilite_annuel        10000
gestion_s1                 10000
gestion_s2                 10000
gestion_annuel             10000
annees_redoublees           4537
matieres_soutien            4813
type_sport                  1421
type_art                    2263
pays_cible                  8344
type_travail                2239
note_examen_regional       10000
note_examen_national       10000
note_controle_continu      10000
note_finale_bac            10000
mention_bac                10000
niveau_allemand             2953
date_mise_a_jour           10000
remarques                  10000
dtype: int64


In [5]:
# Duplicates
# A row is counted when all of its columns repeat an earlier row; the first
# occurrence of each repeated row is not counted (keep="first" is the default).
duplicates = df.duplicated(keep="first").sum()
print("\n--- Duplicates ---")
print(f"Number of duplicate rows: {duplicates}")


--- Duplicates ---
Number of duplicate rows: 0


In [6]:
# Unique values in categorical columns (sample)
# For a hand-picked set of categorical columns, print up to the first ten
# distinct values in order of first appearance. Columns with more than ten
# distinct values are truncated without any marker in the output.
print("\n--- Unique Values in Select Categorical Columns ---")
categorical_cols = ['sexe', 'region', 'type_etablissement', 'filiere']
for col in categorical_cols:
    # Skip names that are absent from this dataset rather than raising KeyError.
    if col not in df.columns:
        continue
    print(f"\nUnique values in '{col}':")
    print(df[col].unique()[:10])


--- Unique Values in Select Categorical Columns ---

Unique values in 'sexe':
['F' 'M']

Unique values in 'region':
['Fes-Meknes' 'Souss-Massa' 'Casablanca-Settat'
 'Tanger-Tetouan-Al Hoceima' 'Guelmim-Oued Noun' 'Beni Mellal-Khenifra'
 'Draa-Tafilalet' 'Dakhla-Oued Ed-Dahab' 'Laayoune-Sakia El Hamra'
 'Marrakech-Safi']

Unique values in 'type_etablissement':
['Lycee Qualifiant']

Unique values in 'filiere':
['Lettres et Sciences Humaines' 'Sciences et Technologies'
 'Sciences Mathematiques' 'Arts Appliques' 'Sciences Experimentales'
 'Sciences Economiques' 'Sciences']


In [7]:
# Summary Statistics for numerical columns
# describe() with default arguments summarises only the numeric columns
# (count, mean, std, min, quartiles, max).
print("\n--- Summary Statistics ---")
numeric_summary = df.describe()
print(numeric_summary)


--- Summary Statistics ---
                age   code_postal     telephone  annee_inscription  \
count  10000.000000  10000.000000  1.000000e+04            10000.0   
mean      18.090400  52052.000000  6.551848e+08             2024.0   
std        0.789614  24579.420473  2.597032e+07                0.0   
min       17.000000  10000.000000  6.100040e+08             2024.0   
25%       17.000000  31000.000000  6.327122e+08             2024.0   
50%       18.000000  52000.000000  6.552471e+08             2024.0   
75%       19.000000  73000.000000  6.776673e+08             2024.0   
max       19.000000  95000.000000  6.999856e+08             2024.0   

       revenu_mensuel_pere  revenu_mensuel_mere  telephone_tuteur  \
count          10000.00000         10000.000000      1.000000e+04   
mean            9009.11720          5823.895400      6.551801e+08   
std             5966.74195          4600.926752      2.598213e+07   
min                0.00000             0.000000      6.100040e+08