In [19]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Directory that holds the competition CSV files (trailing separator included,
# so file names can be appended directly).
file_path = 'C:\\Users\\user\\Desktop\\playground-series\\'

# Read the training data, the test data and the sample submission, in that
# order, and bind each resulting DataFrame to its own name.
train_df, test_df, sample_submission = (
    pd.read_csv(file_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [20]:
# Preview the top five rows of the training data as the cell's rich output.
train_df.head(n=5)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [21]:
# Count the missing entries in every column of the training data and print
# the per-column totals.
print(train_df.isna().sum())

id                                                0
Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship 

In [22]:
# Basic data exploration.
# info() prints its column/dtype/non-null report straight to stdout, so it
# produces output no matter where it sits in the cell.
train_df.info()

# A notebook cell only renders the value of its final expression. describe()
# returns a summary table rather than printing it, so it has to be the last
# statement of the cell; placed before info() its result was silently dropped.
train_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [23]:
# Encode categorical (object-dtype) feature columns as integer codes.
# One encoder per column is fitted on the training data and then reused for
# the test data so both frames share the same mapping. The fitted encoders
# are kept in `label_encoders`, keyed by column name.
label_encoders = {}
for column in train_df.select_dtypes(include=['object']).columns:
    # The target is handled separately below.
    if column != 'Target':
        label_encoders[column] = LabelEncoder()
        train_df[column] = label_encoders[column].fit_transform(train_df[column])
        # NOTE(review): the encoder only knows categories present in the
        # training column; confirm the test set has no extra categories.
        test_df[column] = label_encoders[column].transform(test_df[column])

# Encode the target into its own Series instead of overwriting
# train_df['Target']. Overwriting made this cell non-idempotent: on a second
# run the encoder was fitted on the integer codes, so the later
# target_encoder.inverse_transform(...) returned codes rather than the
# original class names.
target_encoder = LabelEncoder()
y = pd.Series(
    target_encoder.fit_transform(train_df['Target']),
    index=train_df.index,
    name='Target',
)

# Feature matrix: every column except the row identifier and the target.
X = train_df.drop(columns=['id', 'Target'])



In [24]:
# Split the data into training and validation sets FIRST, so the validation
# rows play no part in the scaling statistics computed below.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the validation and test features.
# `X` itself is deliberately NOT reassigned: overwriting it with its scaled
# version meant a second run of this cell fitted the scaler on already
# standardized data, which left `test_X` effectively unscaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Test features: drop the row identifier, then apply the same scaling.
test_X = test_df.drop(columns=['id'])
test_X = scaler.transform(test_X)

In [25]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in a single expression (fit() hands back the fitted estimator).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out validation split and report the accuracy.
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Predict the test set and map the integer class codes straight back to the
# original target labels.
test_predictions = target_encoder.inverse_transform(model.predict(test_X))

Validation Accuracy: 0.8267


In [26]:
# Assemble the submission table: the test ids paired with their predicted
# labels, then write it to disk without the index column.
submission1_df = test_df[['id']].assign(Target=test_predictions)
submission1_df.to_csv('submission1.csv', index=False)
print('Submission file created successfully!')

Submission file created successfully!
