In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [7]:
file_path = 'D:/miniconda python/data sets/MBA.csv'
data = pd.read_csv(file_path)

In [8]:
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   application_id  gender  international   gpa       major      race   gmat  \
0               1  Female          False  3.30    Business     Asian  620.0   
1               2    Male          False  3.28  Humanities     Black  680.0   
2               3  Female           True  3.30    Business       NaN  710.0   
3               4    Male          False  3.47        STEM     Black  690.0   
4               5    Male          False  3.35        STEM  Hispanic  590.0   

   work_exp          work_industry admission  
0       3.0     Financial Services     Admit  
1       5.0  Investment Management       NaN  
2       5.0             Technology     Admit  
3       6.0             Technology       NaN  
4       5.0             Consulting       NaN  


In [9]:
print("\nDataset Information:")
data.info()


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6194 entries, 0 to 6193
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   application_id  6194 non-null   int64  
 1   gender          6194 non-null   object 
 2   international   6194 non-null   bool   
 3   gpa             6194 non-null   float64
 4   major           6194 non-null   object 
 5   race            4352 non-null   object 
 6   gmat            6194 non-null   float64
 7   work_exp        6194 non-null   float64
 8   work_industry   6194 non-null   object 
 9   admission       1000 non-null   object 
dtypes: bool(1), float64(3), int64(1), object(5)
memory usage: 441.7+ KB


In [10]:
print("\nMissing Values in Each Column:")
missing_values = data.isnull().sum()
print(missing_values)


Missing Values in Each Column:
application_id       0
gender               0
international        0
gpa                  0
major                0
race              1842
gmat                 0
work_exp             0
work_industry        0
admission         5194
dtype: int64


In [11]:
print("\nDescriptive Statistics for Numerical Features:")
print(data.describe())



Descriptive Statistics for Numerical Features:
       application_id          gpa         gmat     work_exp
count     6194.000000  6194.000000  6194.000000  6194.000000
mean      3097.500000     3.250714   651.092993     5.016952
std       1788.198115     0.151541    49.294883     1.032432
min          1.000000     2.650000   570.000000     1.000000
25%       1549.250000     3.150000   610.000000     4.000000
50%       3097.500000     3.250000   650.000000     5.000000
75%       4645.750000     3.350000   680.000000     6.000000
max       6194.000000     3.770000   780.000000     9.000000


In [12]:
categorical_features = ['gender', 'major', 'race', 'work_industry', 'admission']
print("\nDistribution of Categorical Features:")
for feature in categorical_features:
    print(f"\n{feature} Distribution:")
    print(data[feature].value_counts())


Distribution of Categorical Features:

gender Distribution:
gender
Male      3943
Female    2251
Name: count, dtype: int64

major Distribution:
major
Humanities    2481
STEM          1875
Business      1838
Name: count, dtype: int64

race Distribution:
race
White       1456
Asian       1147
Black        916
Hispanic     596
Other        237
Name: count, dtype: int64

work_industry Distribution:
work_industry
Consulting               1619
PE/VC                     907
Technology                716
Nonprofit/Gov             651
Investment Banking        580
Financial Services        451
Other                     421
Health Care               334
Investment Management     166
CPG                       114
Real Estate               111
Media/Entertainment        59
Retail                     33
Energy                     32
Name: count, dtype: int64

admission Distribution:
admission
Admit       900
Waitlist    100
Name: count, dtype: int64


In [13]:
data = data.dropna(subset=['admission'])

In [14]:
data['race'] = data['race'].fillna('Unknown')


In [15]:
print("\nEncoding Categorical Variables...")
label_encoders = {}
for feature in ['gender', 'major', 'race', 'work_industry', 'admission']:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le


Encoding Categorical Variables...


In [16]:
print("\nSplitting the dataset into features and target variable...")
X = data.drop(columns=['application_id', 'admission'])
y = data['admission']


Splitting the dataset into features and target variable...


In [17]:
print("\nSplitting the data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Splitting the data into training and testing sets...


In [18]:
print("\nStandardizing the features...")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Standardizing the features...


In [19]:
print("\nTraining the SVM model...")
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)


Training the SVM model...


In [20]:
print("\nSVM Model Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))


SVM Model Evaluation:
Confusion Matrix:
[[265   0]
 [ 35   0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       265
           1       0.00      0.00      0.00        35

    accuracy                           0.88       300
   macro avg       0.44      0.50      0.47       300
weighted avg       0.78      0.88      0.83       300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
print("\nTraining the KNN model...")
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)


Training the KNN model...


In [22]:
print("\nKNN Model Evaluation:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_knn))


KNN Model Evaluation:
Confusion Matrix:
[[264   1]
 [ 35   0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       265
           1       0.00      0.00      0.00        35

    accuracy                           0.88       300
   macro avg       0.44      0.50      0.47       300
weighted avg       0.78      0.88      0.83       300

