#YoungDev Interns
Machine Learning Intermediate tasks

Task 1: Build a Model with Cross-Validation

Use k-fold cross-validation with scikit-learn to assess model performance.
Compare metrics such as accuracy or F1-score across folds.
Preprocess Data for Machine Learning


In [6]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, f1_score

model = SVC(kernel='linear', random_state=42)

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_macro': make_scorer(f1_score, average='macro')
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracy_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='accuracy')
f1_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='f1_macro')

In [9]:
print(f"Accuracy across folds: {accuracy_scores}")
print(f"Mean accuracy: {accuracy_scores.mean():.3f} ± {accuracy_scores.std():.3f}")
print(f"\nF1-score (macro) across folds: {f1_scores}")
print(f"Mean F1-score: {f1_scores.mean():.3f} ± {f1_scores.std():.3f}")

Accuracy across folds: [0.95833333 0.95833333 0.95833333 1.         0.91666667]
Mean accuracy: 0.958 ± 0.026

F1-score (macro) across folds: [0.95816993 0.95816993 0.95816993 1.         0.91534392]
Mean F1-score: 0.958 ± 0.027


In [10]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, X_train_scaled, y_train, cv=cv,
                           scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'])


print("\nDetailed metrics per fold:")
for i in range(cv.get_n_splits()):
    print(f"\nFold {i+1}:")
    print(f"  Accuracy: {cv_results['test_accuracy'][i]:.3f}")
    print(f"  F1-score: {cv_results['test_f1_macro'][i]:.3f}")
    print(f"  Precision: {cv_results['test_precision_macro'][i]:.3f}")
    print(f"  Recall: {cv_results['test_recall_macro'][i]:.3f}")


print("\nOverall performance:")
for metric in ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']:
    scores = cv_results[f'test_{metric}']
    print(f"{metric}: {scores.mean():.3f} ± {scores.std():.3f}")


Detailed metrics per fold:

Fold 1:
  Accuracy: 0.958
  F1-score: 0.958
  Precision: 0.958
  Recall: 0.963

Fold 2:
  Accuracy: 0.958
  F1-score: 0.958
  Precision: 0.963
  Recall: 0.958

Fold 3:
  Accuracy: 0.958
  F1-score: 0.958
  Precision: 0.963
  Recall: 0.958

Fold 4:
  Accuracy: 1.000
  F1-score: 1.000
  Precision: 1.000
  Recall: 1.000

Fold 5:
  Accuracy: 0.917
  F1-score: 0.915
  Precision: 0.933
  Recall: 0.917

Overall performance:
accuracy: 0.958 ± 0.026
f1_macro: 0.958 ± 0.027
precision_macro: 0.964 ± 0.021
recall_macro: 0.959 ± 0.026


In [11]:
# Train on full training set
model.fit(X_train_scaled, y_train)

# Evaluate on test set
test_accuracy = model.score(X_test_scaled, y_test)
print(f"\nTest set accuracy: {test_accuracy:.3f}")


Test set accuracy: 0.967


Task 2: Handle missing values, normalize features, and encode categorical variables using scikit-learn.
Split data into training and testing sets.
Create a Classification Report (Using Wine Dataset)

In [12]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report



In [13]:
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = wine.target


np.random.seed(42)
mask = np.random.rand(*X.shape) < 0.1
X[mask] = np.nan

X['alcohol_category'] = pd.cut(X['alcohol'], bins=3, labels=['low', 'medium', 'high'])
X.drop('alcohol', axis=1, inplace=True)

Preproocessing pipeline

In [14]:
numeric_features = [col for col in X.columns if col != 'alcohol_category']
categorical_features = ['alcohol_category']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Model Pipeline

  

In [15]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create complete pipeline with classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))])

Model results and classificiation report


In [16]:
# Train model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=wine.target_names))

# Additional metrics with cross-validation
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"\nCross-validated accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")


Classification Report:
              precision    recall  f1-score   support

     class_0       0.93      0.93      0.93        14
     class_1       0.93      0.93      0.93        14
     class_2       1.00      1.00      1.00         8

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.94      0.94      0.94        36


Cross-validated accuracy: 0.955 ± 0.038


Task 3: Use scikit-learn to train a classification model.
Produce a report including precision, recall, and F1-score for each class. (Breast cancer dataset)

In [17]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target

print(f"Dataset shape: {X.shape}")
print(f"Feature names:\n{cancer.feature_names}")
print(f"Target classes: {cancer.target_names} (0: malignant, 1: benign)")
print(f"\nClass distribution:\n{pd.Series(y).value_counts()}")

Dataset shape: (569, 30)
Feature names:
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Target classes: ['malignant' 'benign'] (0: malignant, 1: benign)

Class distribution:
1    357
0    212
Name: count, dtype: int64


data preprocessing

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

pipeline = Pipeline([
    ('scaler', StandardScaler()),  #Standardize features
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42,
        class_weight='balanced'  #class imbalance
    ))
])

Model training and evaluation

In [21]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))


cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1')
print(f"\nCross-validated F1 scores: {cv_scores}")
print(f"Mean F1: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")


=== Classification Report ===
              precision    recall  f1-score   support

   malignant       0.89      0.93      0.91        42
      benign       0.96      0.93      0.94        72

    accuracy                           0.93       114
   macro avg       0.92      0.93      0.93       114
weighted avg       0.93      0.93      0.93       114


=== Confusion Matrix ===
[[39  3]
 [ 5 67]]

Cross-validated F1 scores: [0.93617021 0.95890411 0.98611111 0.97222222 0.97142857]
Mean F1: 0.965 ± 0.017


Feature importance Analysis

In [22]:
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

print("\n=== Feature Importances ===")
for i in range(10):
    print(f"{i+1}. {cancer.feature_names[indices[i]]}: {importances[indices[i]]:.4f}")


=== Feature Importances ===
1. worst area: 0.1540
2. worst concave points: 0.1270
3. mean concave points: 0.1006
4. worst radius: 0.0909
5. worst perimeter: 0.0763
6. mean concavity: 0.0743
7. mean perimeter: 0.0695
8. mean radius: 0.0443
9. area error: 0.0430
10. mean area: 0.0373
