In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load the survey export and trim leading/trailing whitespace from every
# column header so the rename mapping further down can match them exactly.
df = pd.read_csv("PCOS_DATA.csv").rename(columns=str.strip)

# Map the verbose survey questions to short, code-friendly column names.
# Keys must match the (already stripped) CSV headers exactly, including the
# embedded line breaks in the two period-related questions.
column_aliases = {
    'Age (in Years)': 'Age',
    'Weight (in Kg)': 'Weight',
    'Height (in Cm / Feet)': 'Height',
    'Can you tell us your blood group ?': 'BloodGroup',
    'After how many months do you get your periods?\n(select 1- if every month/regular)': 'PeriodGap',
    'Have you gained weight recently?': 'WeightGain',
    'Do you have excessive body/facial hair growth ?': 'HairGrowth',
    'Are you noticing skin darkening recently?': 'SkinDarkening',
    'Do have hair loss/hair thinning/baldness ?': 'HairLoss',
    'Do you have pimples/acne on your face/jawline ?': 'Pimples',
    'Do you eat fast food regularly ?': 'FastFood',
    'Do you exercise on a regular basis ?': 'Exercise',
    'Have you been diagnosed with PCOS/PCOD?': 'PCOS',
    'Do you experience mood swings ?': 'MoodSwings',
    'Are your periods regular ?': 'RegularPeriods',
    'How long does your period last ? (in Days)\nexample- 1,2,3,4.....': 'PeriodDuration',
}
df = df.rename(columns=column_aliases)

# Boolean (yes/no style) symptom and lifestyle columns.
bool_columns = [
    'WeightGain',
    'HairGrowth',
    'SkinDarkening',
    'HairLoss',
    'Pimples',
    'FastFood',
    'Exercise',
    'MoodSwings',
    'RegularPeriods',
]

# Full list of model inputs: the numeric measurements first, then the flags.
features = ['Age', 'Weight', 'Height', 'PeriodGap', 'PeriodDuration', *bool_columns]

# Cleaning: keep only the model inputs plus the target and drop every row that
# has a missing value in any of them. The explicit .copy() makes the result an
# independent frame, so the column assignment below never writes into a view.
df = df[features + ['PCOS']].dropna().copy()


def _to_binary_label(value) -> int:
    """Convert one raw PCOS answer to 1 (positive) or 0 (anything else).

    Accepted positive spellings are 'yes', '1' and 'true' (case-insensitive,
    surrounding whitespace ignored) as well as any value that parses to the
    number 1, e.g. the float 1.0 that pandas produces when an integer column
    contained missing values. Unparseable or other values map to 0.
    """
    text = str(value).strip().lower()
    if text in ('yes', '1', 'true'):
        return 1
    try:
        # Catches '1.0', '1.00', '1e0', ... which a plain string match misses.
        return int(float(text) == 1.0)
    except ValueError:
        return 0


# Convert the target to 0 or 1
df['PCOS'] = df['PCOS'].apply(_to_binary_label)

# Split into the feature matrix X and the target vector y
X = df[features]
y = df['PCOS']

# Train/test split: hold out 20% of the rows for evaluation. The fixed seed
# makes the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest model. Bootstrap sampling and per-split feature selection are
# random, so the estimator is seeded too; without it the fitted model (and the
# accuracy reported below) would change on every Restart & Run All.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluation: score the fitted model on the held-out test rows.
test_predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Persist the trained model to disk; joblib.dump returns the list of written
# file names, which the notebook shows as the cell output.
joblib.dump(value=model, filename="model_pcos.pkl")


Accuracy: 92.47%


['model_pcos.pkl']

In [None]:
# Preview the first five rows of `df` as a quick visual sanity check.
df.head(n=5)
