In [1]:
import pandas as pd
import numpy as np

# 1. Load and prepare the dataset

# Location of the auto-labeled responses CSV.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# configurable before sharing the notebook.
DATA_PATH = 'D:/inductive_deductive/backend/auto_labeled_responses.csv'

df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,inductive_score,deductive_score,label
0,4,3,1,4,1,4,2,4,3,2,3,4,2,2,15,24,Deductive
1,3,1,2,1,4,1,2,4,1,4,3,1,2,3,18,14,Inductive
2,4,1,4,1,2,3,3,3,2,1,1,4,2,1,18,14,Inductive
3,2,2,3,3,3,1,3,3,2,3,3,1,4,3,20,16,Inductive
4,2,3,4,1,1,4,1,2,2,1,1,1,3,1,14,13,Inductive


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,inductive_score,deductive_score
count,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0
mean,2.491124,2.680473,2.704142,2.378698,2.692308,2.366864,2.727811,2.757396,2.573964,2.64497,2.633136,2.715976,2.715976,2.526627,18.550296,18.059172
std,1.091735,1.141077,1.162937,1.068386,1.164965,1.116048,1.116647,1.141879,1.110892,1.140954,1.055751,1.134856,1.097527,1.12369,4.32863,4.171479
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
25%,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,16.0,16.0
50%,2.0,3.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,18.0,19.0
75%,3.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,22.0,21.0
max,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,28.0,28.0


In [6]:
# Total number of missing cells in the frame: per-column null counts, summed again.
df.isnull().sum().sum()

np.int64(0)

In [7]:
# Column names, non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Q1               169 non-null    int64 
 1   Q2               169 non-null    int64 
 2   Q3               169 non-null    int64 
 3   Q4               169 non-null    int64 
 4   Q5               169 non-null    int64 
 5   Q6               169 non-null    int64 
 6   Q7               169 non-null    int64 
 7   Q8               169 non-null    int64 
 8   Q9               169 non-null    int64 
 9   Q10              169 non-null    int64 
 10  Q11              169 non-null    int64 
 11  Q12              169 non-null    int64 
 12  Q13              169 non-null    int64 
 13  Q14              169 non-null    int64 
 14  inductive_score  169 non-null    int64 
 15  deductive_score  169 non-null    int64 
 16  label            169 non-null    object
dtypes: int64(16), object(1)
memory usag

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

In [9]:
# Everything except 'label' is used as a feature.
# NOTE(review): 'inductive_score' and 'deductive_score' stay in X; if 'label' was
# derived from these two scores this is target leakage — confirm before trusting
# the reported accuracies.
X = df.drop('label', axis='columns')
y = df['label']

# Turn the string class names into integer codes; the fitted encoder is kept so
# predictions can be mapped back to the original names later.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
# 2. Preprocessing: Imputation + Scaling
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
preprocessor = Pipeline(steps=preprocessing_steps)

# Fit on the training split only; the test split is transformed with the
# statistics learned from the training data.
# NOTE: both names are rebound to the transformed arrays, so re-running this cell
# without first re-running the split refits on already-transformed data.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [11]:
# Resample the training split only; the test split is left as-is so evaluation
# runs on real, un-synthesised rows.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

In [12]:
# Candidate classifiers, keyed by the display name used in the printed reports.
# Estimators that accept a seed share the same one so runs are repeatable.
RANDOM_STATE = 42

models = dict([
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),
    ('SVM', SVC(probability=True, random_state=RANDOM_STATE)),
    ('Decision Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
])

In [13]:
# Train every candidate on the resampled training data and record its accuracy
# on the held-out test split, keyed by model name.
model_scores = {}
for name, model in models.items():
    print(f"🔹 {name}")

    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred = model.fit(X_train_res, y_train_res).predict(X_test)

    test_acc = accuracy_score(y_test, y_pred)
    model_scores[name] = test_acc

    print(f"Test Accuracy      : {test_acc:.4f}\n")


🔹 Random Forest
Test Accuracy      : 0.8529

🔹 SVM
Test Accuracy      : 0.9412

🔹 Decision Tree
Test Accuracy      : 0.8529

🔹 KNN
Test Accuracy      : 0.8235

🔹 Naive Bayes
Test Accuracy      : 0.9118



In [19]:
# 6. Identify the best model
# Pick the entry with the highest test accuracy instead of hard-coding a name, so
# the model that is saved and reported always matches the scores in model_scores.
# On a tie, max() keeps the first model in insertion order.
# NOTE(review): the winner is chosen on the same test split used to report its
# accuracy, so that figure is likely optimistic — consider cross-validation.
best_model_name = max(model_scores, key=model_scores.get)
best_model = models[best_model_name]
best_accuracy = model_scores[best_model_name]


In [20]:
import pickle
MODEL_PATH = 'model.pkl'
PREPROCESSOR_PATH = 'preprocessor.pkl'
ENCODER_PATH = 'encoder.pkl'

# Persist the three objects needed at inference time: the chosen classifier, the
# fitted preprocessing pipeline, and the label encoder. Paths are relative, so the
# files land in the current working directory.
artifacts = (
    (MODEL_PATH, best_model),
    (PREPROCESSOR_PATH, preprocessor),
    (ENCODER_PATH, target_encoder),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [21]:

# Recap: one line per candidate model, then the selected one.
print("\n--- Summary ---")
for model_name in model_scores:
    test_acc = model_scores[model_name]
    print(f"{model_name:20} | Test: {test_acc:.4f}")

print(f"\nBest model: {best_model_name} with accuracy: {best_accuracy:.4f}")


--- Summary ---
Random Forest        | Test: 0.8529
SVM                  | Test: 0.9412
Decision Tree        | Test: 0.8529
KNN                  | Test: 0.8235
Naive Bayes          | Test: 0.9118

Best model: SVM with accuracy: 0.9412
