imports 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer


Step 1: Data Attributes Identification & Data Collection

In [3]:
df = pd.read_csv('/Users/shashi/Desktop/ML/ ML_lab_me/student_dataset_copy.csv')
print("Data loaded successfully.")

Data loaded successfully.


Step 2: Data Pre-Processing

In [4]:
# Check for null values in the dataset
print(df.isnull().sum())


Student Name      0
Student ID        1
Year              2
IOT               0
CTSD              1
DS                1
OOPS              0
DTI               1
COA               1
DBMS              2
PFSD              0
MPC               0
ASE               1
OS                1
Specialization    1
dtype: int64


In [5]:

# Select only numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns

# Fill null values with the mean for numerical columns only
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Check the results
print("\nNull values after filling with mean:")
print(df.isnull().sum())  # Should show no null values for numeric columns


Null values after filling with mean:
Student Name      0
Student ID        0
Year              0
IOT               0
CTSD              1
DS                1
OOPS              0
DTI               1
COA               0
DBMS              2
PFSD              0
MPC               0
ASE               1
OS                0
Specialization    1
dtype: int64


In [6]:
# numeric_cols = df.select_dtypes(include=['number']).columns
# df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
columns_to_convert = ['IOT', 'CTSD', 'DS', 'OOPS', 'DTI', 'COA', 'DBMS', 'PFSD', 'MPC', 'ASE', 'OS']
df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')
df_cleaned = df.dropna()
print("Data preprocessing completed.")

Data preprocessing completed.


In [7]:
print(df_cleaned.head())    

  Student Name  Student ID  Year  IOT  CTSD    DS  OOPS  DTI   COA  DBMS  \
0    Student_1      1000.0   1.0  7.0   8.0   9.0   6.0  8.0   7.0   8.0   
1    Student_2      1001.0   2.0  6.0   8.0  10.0   6.0  9.0   7.0   7.0   
2    Student_3      1002.0   2.0  8.0  10.0   7.0   7.0  7.0   7.0   8.0   
3    Student_4      1003.0   1.0  9.0   6.0   6.0   8.0  9.0   7.0  10.0   
4    Student_5      1004.0   1.0  6.0   8.0   8.0  10.0  7.0  10.0   9.0   

   PFSD  MPC   ASE   OS  Specialization  
0   6.0    6   8.0  8.0          AI-IPA  
1   6.0   10   6.0  8.0           Cloud  
2   6.0   10  10.0  9.0  Cyber Security  
3   6.0    7   8.0  7.0  Cyber Security  
4  10.0    8   8.0  9.0           Cloud  


Step 3: Feature Engineering

In [8]:
X = df_cleaned.drop(columns=['Student Name', 'Student ID', 'Specialization'])
y = df_cleaned['Specialization']
print("Feature engineering completed.")

Feature engineering completed.



 Step 4: Training and Testing Data Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split into training and testing sets.")

Data split into training and testing sets.


In [10]:
print(f'Train set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')

Train set size: 39982
Test set size: 9996


In [11]:
print("\nTraining Features (X_train):")
print(X_train)
print("\nTraining Target (y_train):")
print(y_train)
print("\nTesting Features (X_test):")
print(X_test)
print("\nTesting Target (y_test):")
print(y_test)


Training Features (X_train):
       Year   IOT  CTSD    DS  OOPS   DTI   COA  DBMS  PFSD  MPC   ASE    OS
47288   2.0   8.0   6.0   9.0   8.0   8.0   6.0   7.0   9.0    8   8.0  10.0
30262   1.0  10.0   8.0  10.0   8.0   8.0   6.0   9.0   6.0    6   6.0   6.0
26514   1.0   6.0   6.0   9.0   6.0   8.0   7.0   9.0   9.0   10   8.0  10.0
18137   2.0   8.0   7.0   7.0   8.0   7.0   8.0   9.0   8.0    6  10.0   7.0
24082   2.0   9.0   9.0   9.0   8.0   9.0  10.0   7.0   9.0    7   9.0   9.0
...     ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   ...
11306   1.0   6.0   9.0   8.0   8.0  10.0   6.0   9.0   9.0    8   6.0   9.0
44754   2.0   6.0   6.0   6.0   7.0   6.0   6.0   6.0  10.0   10   6.0   7.0
38180   2.0  10.0   9.0   8.0   9.0   6.0   6.0   6.0   8.0   10   7.0   7.0
873     2.0   9.0  10.0   8.0   7.0   8.0   9.0   8.0   6.0    6   6.0   7.0
15817   1.0  10.0   7.0   9.0  10.0   9.0   7.0   6.0  10.0    9   7.0   7.0

[39982 rows x 12 columns]

Training Target (y

Step 5: Choose Appropriate ML Algorithms

In [12]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}
print("Models selected.")

Models selected.



Step 6: Build ML Model with Training Data

In [13]:
trained_models = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model
    print(f"{name} model trained successfully.")

Decision Tree model trained successfully.
Random Forest model trained successfully.
K-Nearest Neighbors model trained successfully.


Step 7: Evaluate Model with Testing Data

In [14]:
for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

Decision Tree Accuracy: 0.33
Random Forest Accuracy: 0.33
K-Nearest Neighbors Accuracy: 0.34


Step 8: Compute Performance Metrics

In [15]:
for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    print(f"Performance metrics for {name}:\n")
    print(classification_report(y_test, y_pred))

Performance metrics for Decision Tree:

                precision    recall  f1-score   support

        AI-IPA       0.32      0.33      0.32      3258
         Cloud       0.34      0.33      0.33      3366
Cyber Security       0.33      0.32      0.33      3372

      accuracy                           0.33      9996
     macro avg       0.33      0.33      0.33      9996
  weighted avg       0.33      0.33      0.33      9996

Performance metrics for Random Forest:

                precision    recall  f1-score   support

        AI-IPA       0.33      0.35      0.34      3258
         Cloud       0.33      0.33      0.33      3366
Cyber Security       0.34      0.33      0.33      3372

      accuracy                           0.33      9996
     macro avg       0.33      0.33      0.33      9996
  weighted avg       0.33      0.33      0.33      9996

Performance metrics for K-Nearest Neighbors:

                precision    recall  f1-score   support

        AI-IPA       0.34  

Step 9: Perform Hyperparameter Tuning (example with Random Forest)

In [16]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30]
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_
print("Best hyperparameters for Random Forest:", grid_search.best_params_)


Best hyperparameters for Random Forest: {'max_depth': 10, 'n_estimators': 50}


Step 10: Iterate through steps 3-9 if needed (assume we are satisfied with tuning)

Step 11: Choose the Model with Optimum Performance

In [18]:
best_model = best_rf_model
print("Best model selected: Random Forest with hyperparameter tuning.")

Best model selected: Random Forest with hyperparameter tuning.


Define prediction function

In [19]:
# # Define prediction function
# def predict_specialization(student_name, student_id, year, iot, ctsd, ds, oops, dti, coa, dbms, pfsd, mpc, ase, os):
#     # Prepare input data in the correct format
#     input_data = pd.DataFrame({
#         'Year': [year],
#         'IOT': [iot],
#         'CTSD': [ctsd],
#         'DS': [ds],
#         'OOPS': [oops],
#         'DTI': [dti],
#         'COA': [coa],
#         'DBMS': [dbms],
#         'PFSD': [pfsd],
#         'MPC': [mpc],
#         'ASE': [ase],
#         'OS': [os]
#     })

#     # Impute any missing values in the input data
#     imputer = SimpleImputer(strategy="mean")  
#     input_data = pd.DataFrame(imputer.fit_transform(input_data), columns=input_data.columns)

#     # Predict specialization
#     prediction = best_model.predict(input_data)
#     return prediction[0]

# # Example usage
# predicted_specialization = predict_specialization("shashi", 1, 10, 4, 9, 9, 6, 7, 5, 6, 9, 7, 5, 9)
# print(f"Predicted Specialization: {predicted_specialization}")


In [20]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Define prediction function
def predict_specialization(student_name, student_id, year, iot, ctsd, ds, oops, dti, coa, dbms, pfsd, mpc, ase, os):
    # Prepare input data in the correct format
    input_data = pd.DataFrame({
        'Year': [year],
        'IOT': [iot],
        'CTSD': [ctsd],
        'DS': [ds],
        'OOPS': [oops],
        'DTI': [dti],
        'COA': [coa],
        'DBMS': [dbms],
        'PFSD': [pfsd],
        'MPC': [mpc],
        'ASE': [ase],
        'OS': [os]
    })

    # Impute any missing values in the input data
    imputer = SimpleImputer(strategy="mean")
    input_data = pd.DataFrame(imputer.fit_transform(input_data), columns=input_data.columns)

    # Predict specialization
    prediction = best_model.predict(input_data)
    return prediction[0]

# Collect inputs from the user
student_name = input("Enter Student Name: ")
student_id = int(input("Enter Student ID: "))
year = int(input("Enter Year: "))
iot = float(input("Enter IOT score: "))
ctsd = float(input("Enter CTSD score: "))
ds = float(input("Enter DS score: "))
oops = float(input("Enter OOPS score: "))
dti = float(input("Enter DTI score: "))
coa = float(input("Enter COA score: "))
dbms = float(input("Enter DBMS score: "))
pfsd = float(input("Enter PFSD score: "))
mpc = float(input("Enter MPC score: "))
ase = float(input("Enter ASE score: "))
os = float(input("Enter OS score: "))

# Make prediction
predicted_specialization = predict_specialization(student_name, student_id, year, iot, ctsd, ds, oops, dti, coa, dbms, pfsd, mpc, ase, os)
print(f"Student {student_name} (ID: {student_id}) is predicted to specialize in {predicted_specialization}.")



ValueError: could not convert string to float: ''