In [2]:
# 1. Loading the Dataset
import pandas as pd

# Load the users dataset
users = pd.read_csv('users.csv')

# 2. Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
import numpy as np

# Handling categorical variables
company_encoder = OneHotEncoder()
company_features = company_encoder.fit_transform(users[['company']])

# Scaling numerical features
scaler = StandardScaler()
age_feature = scaler.fit_transform(users[['age']])

# Combine features
X = np.concatenate([company_features.toarray(), age_feature], axis=1)

# Target variable (gender)
y = users['gender']

# Encode target variable
gender_encoder = LabelEncoder()
y_encoded = gender_encoder.fit_transform(y)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# 3. Model Training
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a classification model
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Predict on test data
y_pred = classifier.predict(X_test)

# 4. Model Validation
# Calculate accuracy and other metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(report)

# 5. Saving the Trained Model
import joblib

joblib.dump(classifier, 'gender_classification_model.pkl')

# 6. Saving the Encoder and Scaler
joblib.dump(company_encoder, 'company_encoder.pkl')
joblib.dump(scaler, 'age_scaler.pkl')
joblib.dump(gender_encoder, 'gender_encoder.pkl')


Accuracy: 0.2947761194029851
              precision    recall  f1-score   support

           0       0.29      0.29      0.29        89
           1       0.29      0.23      0.26        94
           2       0.30      0.36      0.33        85

    accuracy                           0.29       268
   macro avg       0.29      0.30      0.29       268
weighted avg       0.29      0.29      0.29       268



['gender_encoder.pkl']