In [2]:
# ===============================
# QUESTION 3: Random Forest on Custom CSV Dataset
# Predict whether a student will Pass or Fail based on study_hours, attendance, and marks
# ===============================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ---------------------------------------------------
# Step 1: Create and save the dataset as a CSV file
# ---------------------------------------------------
data = {
    'study_hours': [2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8],
    'attendance': [60, 70, 80, 90, 85, 95, 75, 100, 65, 78, 88, 92, 96, 85],
    'marks': [50, 55, 60, 70, 75, 80, 65, 90, 52, 58, 67, 72, 81, 77],
    'result': ['Fail', 'Fail', 'Fail', 'Pass', 'Pass', 'Pass', 'Pass', 'Pass',
               'Fail', 'Fail', 'Pass', 'Pass', 'Pass', 'Pass']
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV file
df.to_csv('students.csv', index=False)
print("âœ… students.csv file created successfully!\n")

# ---------------------------------------------------
# Step 2: Load the CSV dataset
# ---------------------------------------------------
df = pd.read_csv('students.csv')
print("Dataset preview:\n", df.head(), "\n")

# ---------------------------------------------------
# Step 3: Split features and target variable
# ---------------------------------------------------
X = df[['study_hours', 'attendance', 'marks']]
y = df['result']

# ---------------------------------------------------
# Step 4: Train-test split (70% training, 30% testing)
# ---------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# ---------------------------------------------------
# Step 5: Train the Random Forest Classifier
# ---------------------------------------------------
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# ---------------------------------------------------
# Step 6: Predict and evaluate the model
# ---------------------------------------------------
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("ðŸŽ¯ Model Evaluation Results:")
print(f"Accuracy: {accuracy:.2f}")
print("\nFeature Importances:")
print(pd.Series(rf.feature_importances_, index=X.columns))

# ---------------------------------------------------
# Step 7 (Optional): Predict on new sample data
# ---------------------------------------------------
sample = pd.DataFrame({
    'study_hours': [5, 8],
    'attendance': [88, 95],
    'marks': [68, 85]
})

predictions = rf.predict(sample)
print("\nPredictions for new students:")
print(sample.assign(predicted_result=predictions))


âœ… students.csv file created successfully!

Dataset preview:
    study_hours  attendance  marks result
0            2          60     50   Fail
1            3          70     55   Fail
2            4          80     60   Fail
3            5          90     70   Pass
4            6          85     75   Pass 

ðŸŽ¯ Model Evaluation Results:
Accuracy: 1.00

Feature Importances:
study_hours    0.388377
attendance     0.314859
marks          0.296764
dtype: float64

Predictions for new students:
   study_hours  attendance  marks predicted_result
0            5          88     68             Pass
1            8          95     85             Pass
