In [10]:
import pandas as pd

# Read the training set. 'Personality' is the label column; every other
# column of the file is kept in X.
df = pd.read_csv('train.csv')

X = df.drop(columns='Personality')
y = df['Personality']

In [11]:
# Report the number of missing entries per feature column, followed by the
# number of missing labels.
missing_per_feature = X.isna().sum()
missing_labels = y.isna().sum()

print("Null values in X:")
print(missing_per_feature)

print("\nNull values in y:")
print(missing_labels)

Null values in X:
id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
dtype: int64

Null values in y:
0


In [12]:
# Encode the two Yes/No columns as 1/0 so they are numeric; missing answers
# stay NaN at this point.
#
# The values are read from the untouched `df` rather than from `X`, so
# re-running this cell always gives the same result. Mapping `X` onto itself
# would turn the already-encoded 1/0 values into NaN on a second run, because
# Series.map yields NaN for every value that is not a key of the mapping.
yes_no_to_int = {'Yes': 1, 'No': 0}
for col in ('Stage_fear', 'Drained_after_socializing'):
    # Anything other than 'Yes' / 'No' / missing would silently become NaN;
    # fail loudly instead of losing real answers.
    unexpected = set(df[col].dropna().unique()) - set(yes_no_to_int)
    assert not unexpected, f"Unexpected values in {col}: {unexpected}"
    X[col] = df[col].map(yes_no_to_int)

In [13]:
from sklearn.impute import KNNImputer

# Fill missing entries with a 5-neighbour KNNImputer fitted on every numeric
# column of X. The fitted `imputer` is kept so it can be applied to other
# data later.
# NOTE(review): a numeric identifier column such as `id` would be part of
# numeric_cols and so take part in the neighbour search - confirm intended.
# NOTE(review): the imputer sees all rows of X, i.e. it is fitted before any
# train/validation split - confirm this is acceptable for evaluation.
imputer = KNNImputer(n_neighbors=5)

numeric_cols = X.select_dtypes(include='number').columns
X_imputed = X.copy()
X_imputed[numeric_cols] = imputer.fit_transform(X[numeric_cols])

# From here on the numeric columns of X_imputed contain no gaps.

In [14]:
# Imputed entries in the two binary columns can be fractional; round them
# and store the columns as integers again.
for binary_col in ('Stage_fear', 'Drained_after_socializing'):
    X_imputed[binary_col] = X_imputed[binary_col].round().astype(int)

In [15]:
# Sanity check: remaining missing values per column after imputation.
X_imputed.isna().sum()

id                           0
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
dtype: int64

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Hold out 20% of the rows for validation, stratified on y so both parts
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit a 100-tree random forest on the training part, then predict the
# held-out rows.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.9713900134952766

Classification Report:
               precision    recall  f1-score   support

   Extrovert       0.98      0.98      0.98      2740
   Introvert       0.94      0.95      0.95       965

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [18]:
# Predict labels for the unlabeled test file with the already-fitted
# imputer and random forest.
test_df = pd.read_csv('test.csv')

# Work on a copy under its own name. `X_test` already holds the validation
# split that pairs with `y_test`; rebinding it to this frame would leave the
# two out of sync for anything evaluated afterwards.
X_submission = test_df.copy()

# Same Yes/No -> 1/0 encoding as applied to the training features.
yes_no_to_int = {'Yes': 1, 'No': 0}
binary_cols = ('Stage_fear', 'Drained_after_socializing')
for col in binary_cols:
    X_submission[col] = X_submission[col].map(yes_no_to_int)

# Impute with the imputer fitted on the training data. It is given exactly
# the columns it was fitted on (`numeric_cols`), rather than a column list
# re-derived from this frame's dtypes, so train and test cannot drift apart.
X_test_imputed = X_submission.copy()
X_test_imputed[numeric_cols] = imputer.transform(X_test_imputed[numeric_cols])

# Imputed binary entries can be fractional; round back to integer 0/1.
for col in binary_cols:
    X_test_imputed[col] = X_test_imputed[col].round().astype(int)

# Hand the model the same columns, in the same order, it was trained on.
test_predictions = rf.predict(X_test_imputed[X_train.columns])

# Attach the predictions to the loaded frame and show them next to the ids.
test_df['Predicted_Personality'] = test_predictions
print(test_df[['id', 'Predicted_Personality']])

         id Predicted_Personality
0     18524             Extrovert
1     18525             Introvert
2     18526             Extrovert
3     18527             Extrovert
4     18528             Introvert
...     ...                   ...
6170  24694             Extrovert
6171  24695             Introvert
6172  24696             Extrovert
6173  24697             Extrovert
6174  24698             Introvert

[6175 rows x 2 columns]


In [19]:
# Write the id / prediction pairs to CSV, leaving out the DataFrame index.
submission = test_df[['id', 'Predicted_Personality']]
submission.to_csv('test_predictions.csv', index=False)