In [None]:
# NOTEBOOK.
import kagglehub

# Kaggle handle ("owner/dataset") of the JEE opening/closing rank dataset.
dataset_handle = 'breadnbu22er/or-cr-2016-to-2024'

# Fetch the dataset; the returned value is used below as the directory
# prefix when building the CSV path.
breadnbu22er_or_cr_2016_to_2024_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')

# Silence every warning category for the rest of the session so library
# deprecation notices do not clutter the notebook output.
import warnings
warnings.simplefilter('ignore')

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locate the CSV inside the downloaded dataset directory and read it.
dataset_subdir = 'or-cr-2016-to-2024'
csv_name = 'JEE_Rank_2016_2024.csv'
file_path = '{}/{}/{}'.format(
    breadnbu22er_or_cr_2016_to_2024_path, dataset_subdir, csv_name
)
df = pd.read_csv(file_path)

# Preview the first rows (rendered when run as a notebook cell).
df.head()

### 1. Data Cleaning and Preprocessing

# Per-column count of missing values (rendered when run as a notebook cell).
df.isnull().sum()

# Force both rank columns to numeric; any entry that cannot be parsed
# becomes NaN instead of raising.
for rank_col in ('Opening_Rank', 'Closing_Rank'):
    df[rank_col] = pd.to_numeric(df[rank_col], errors='coerce')

# Discard, in place, every row where either rank ended up as NaN.
df.dropna(subset=['Opening_Rank', 'Closing_Rank'], inplace=True)

# Inspect the resulting column dtypes.
df.dtypes

### 2. Exploratory Data Analysis (EDA)

# Overlaid histograms (50 bins, with KDE curves) of the opening and
# closing ranks on a single figure.
plt.figure(figsize=(12, 6))
for column, colour, legend_label in (
    ('Opening_Rank', 'blue', 'Opening Rank'),
    ('Closing_Rank', 'red', 'Closing Rank'),
):
    sns.histplot(df[column], bins=50, color=colour, label=legend_label, kde=True)
plt.title('Distribution of Opening and Closing Ranks')
plt.xlabel('Rank')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# **Deduction**: 
# Most opening and closing ranks are concentrated in the lower range, indicating the popularity of top-ranked institutes.

# Pairwise correlations between the numeric columns, drawn as an
# annotated heatmap.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(8, 6))
correlations = numeric_df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# **Deduction**: 
# Strong correlation observed between opening and closing ranks, indicating that highly ranked institutes tend to remain in demand throughout the counseling process.

### Additional EDA

# Number of rows per institute over the whole dataset (all years pooled),
# with bars ordered from the most to the least frequent institute.
plt.figure(figsize=(14, 8))
institute_order = df['Institute'].value_counts().index
sns.countplot(data=df, x='Institute', order=institute_order, palette='viridis')
plt.xticks(rotation=90)  # institute names are long; rotate so they stay legible
plt.title('Most Popular Institutes (2016-2024)')
plt.show()

# **Deduction**:
# Certain IITs, such as IIT Bombay and IIT Delhi, dominate the preferences of aspirants.

# Number of rows per branch over the whole dataset (all years pooled),
# with bars ordered from the most to the least frequent branch.
plt.figure(figsize=(14, 8))
branch_order = df['Branch'].value_counts().index
sns.countplot(data=df, x='Branch', order=branch_order, palette='coolwarm')
plt.xticks(rotation=90)  # rotate the tick labels so branch names stay legible
plt.title('Most Popular Branches (2016-2024)')
plt.show()

# **Deduction**: 
# Computer Science Engineering (CSE) is the most sought-after branch across years, followed by Electronics and Electrical Engineering.

# One line per rank column plotted against Year, both on the same axes.
plt.figure(figsize=(14, 6))
for rank_column, series_label in (
    ('Opening_Rank', 'Opening Rank'),
    ('Closing_Rank', 'Closing Rank'),
):
    sns.lineplot(data=df, x='Year', y=rank_column, label=series_label, marker='o')
plt.title('Opening vs Closing Ranks Over the Years')
plt.show()

# **Deduction**:
# Both opening and closing ranks have remained relatively stable over the years, with some fluctuations due to changing preferences.

### 3. Predictive Modeling

# Random Forest regression of Closing_Rank on every other column.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Features: all columns except the target, with categorical columns
# one-hot encoded (first level dropped). Target: the closing rank.
# NOTE(review): Opening_Rank stays in the feature set here - confirm that
# it is meant to be available when predicting the closing rank.
X = pd.get_dummies(df.drop(columns=['Closing_Rank']), drop_first=True)
y = df['Closing_Rank']

# Reproducible 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the forest with default hyperparameters and a fixed seed.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows with mean absolute error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# **Inference**: The Random Forest model shows a decent performance; its MAE is the value printed above (stored in the `mae` variable). However, further hyperparameter tuning might be needed to improve accuracy.

### 4. Bonus: Predicting Popular Choices

# Binary classification: is a row a "popular choice" (closing rank below a
# cutoff), predicted from the non-rank columns only.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Rows whose closing rank is below this cutoff are labelled popular (1);
# all others are labelled 0. The value 1000 is an assumption of this analysis.
POPULAR_RANK_CUTOFF = 1000
df['Popular_Choice'] = np.where(df['Closing_Rank'] < POPULAR_RANK_CUTOFF, 1, 0)

# Features: drop the label and both rank columns (the label is derived from
# Closing_Rank, so keeping the ranks would leak it), then one-hot encode.
X = pd.get_dummies(
    df.drop(columns=['Popular_Choice', 'Closing_Rank', 'Opening_Rank']),
    drop_first=True,
)
y = df['Popular_Choice']

# Reproducible 80/20 split. The thresholded label may be imbalanced, so
# stratify on it to keep the class ratio equal in train and test. Stratifying
# needs at least two rows per class; fall back to a plain split otherwise.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Train a Random Forest classifier with default hyperparameters and a fixed seed.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Evaluate on the held-out rows: overall accuracy plus per-class
# precision/recall/F1, which matter more than accuracy when classes are uneven.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

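# **Inference**:
# The Random Forest Classifier achieves a good accuracy, suggesting that it can predict the likelihood of a branch or institute being popular from the remaining features (such as institute, branch and year). Note that the rank columns are excluded from the inputs; rank is used only to define the "popular" label.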
