In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

In [2]:
# Load every input CSV and stack them row-wise into one frame.
files = ["cleaned.csv"]
dfs = [pd.read_csv(path) for path in files]
# ignore_index=True gives the combined frame a fresh 0..n-1 index, so adding
# more files later cannot produce duplicate index labels.
df = pd.concat(dfs, axis=0, ignore_index=True)

In [3]:
# Class balance of the gender label (still string-valued at this point).
df['gender'].value_counts()

gender
male      10345
female    10345
Name: count, dtype: int64

In [4]:
# Class balance of the age label (still string-valued at this point).
df['age'].value_counts()

age
twenties    14695
fifties      5995
Name: count, dtype: int64

In [5]:
# Encode the two target columns as 0/1 integers.
GENDER_CODES = {'male': 0, 'female': 1}
AGE_CODES = {'twenties': 0, 'fifties': 1}
for column, codes in (('gender', GENDER_CODES), ('age', AGE_CODES)):
    # Only map columns that are still string-valued: mapping an already
    # encoded column would turn every label into NaN when the cell is re-run.
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)
print(df.shape)

(20690, 107)


<Figure size 1200x1000 with 0 Axes>

In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
def analyze_feature_correlations(df, threshold=0.5):
    """List the pairs of numeric columns whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; only its numeric columns are considered.
    threshold : float, default 0.5
        A pair is reported when ``abs(correlation) > threshold``.

    Returns
    -------
    list of tuple
        ``(later_column, earlier_column, correlation)`` triples, one per pair
        in the strict lower triangle of the correlation matrix, ordered by
        the position of the later column and then of the earlier column.
    """
    corr = df.select_dtypes(include=['number']).corr()
    names = corr.columns

    # Walk the strict lower triangle so each unordered pair is visited once.
    return [
        (names[row], names[col], corr.iat[row, col])
        for row in range(len(names))
        for col in range(row)
        if abs(corr.iat[row, col]) > threshold
    ]
# Correlations are computed on the predictors only: 'gender' and 'age' are
# numeric after encoding, and leaving them in would let a feature be pruned
# merely because it correlates strongly with a target.
correlated_features = analyze_feature_correlations(df.drop(columns=['gender', 'age']), 0.8)
def features_to_keep(all_features, corr_features):
    """Select features so that no two kept features form a highly correlated pair.

    Parameters
    ----------
    all_features : iterable of str
        Every candidate feature name.
    corr_features : iterable of tuple
        ``(feature_x, feature_y, correlation)`` triples describing the
        highly correlated pairs; the correlation value itself is not used.

    Returns
    -------
    list of str
        The kept representatives of the correlated pairs (in first-seen
        order) followed by the remaining features in ``all_features`` order.

    Notes
    -----
    Pairs are processed greedily. For each pair whose members are both still
    alive, one member is dropped; a feature that already represents an
    earlier pair is preferred as the survivor. Every listed pair therefore
    loses at least one member, which the previous implementation did not
    guarantee: with pairs ``(A, B)`` and ``(A, C)`` it kept both ``A`` and ``C``.
    """
    kept = {}        # insertion-ordered set of pair representatives
    dropped = set()

    for x, y, _ in corr_features:
        if x == y or x in dropped or y in dropped:
            # Degenerate pair, or one member is already gone.
            continue
        if y in kept and x not in kept:
            # Keep the established representative rather than the newcomer.
            x, y = y, x
        dropped.add(y)
        kept.pop(y, None)
        kept[x] = None

    selected = list(kept)
    selected.extend(
        feature for feature in all_features
        if feature not in dropped and feature not in kept
    )
    return selected
# Prune one member of every highly correlated pair, then make sure both
# target columns are still part of the reduced frame.
remaining_features = features_to_keep(df.select_dtypes(include=['number']).columns, correlated_features)
remaining_features.append('age')
remaining_features.append('gender')
# De-duplicate while preserving order. A plain set() would shuffle the columns
# differently on every kernel start (string hash randomisation), which makes
# the feature matrix, and any model fitted on it, irreproducible.
remaining_features = list(dict.fromkeys(remaining_features))
df = df[remaining_features]
# X holds the predictors only.
X = df.drop(columns=['gender', 'age'])
print(X.shape, df.shape)

(20690, 99) (20690, 101)


In [7]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
def _fit_or_search(model, params, X_fit, y_fit, verbose_level, random_state=None):
    """Fit ``model`` directly, or tune it with a randomized search when ``params`` is given.

    Returns ``(fitted_estimator, best_params, best_cv_score)``; the last two
    are ``None`` when no search was run.
    """
    if params is None:
        model.fit(X_fit, y_fit)
        return model, None, None
    search = RandomizedSearchCV(
        model,
        param_distributions=params,
        cv=5,
        n_jobs=-1,
        verbose=verbose_level,
        random_state=random_state,
    )
    search.fit(X_fit, y_fit)
    return search.best_estimator_, search.best_params_, search.best_score_


def grid_search_sequential(model_gender, model_age,params_gender, params_age, X_reduced,df, proba = False, verbose_level = 2, mode = 'both', search_random_state = None):
    """Train a gender classifier, then an age classifier that also sees the predicted gender.

    The data is split 80/20 with a fixed seed. The gender model is fitted (or
    tuned) on the training features. Its cross-validated predictions on the
    training set are appended as an extra column for the age model, and at
    test time the fitted gender model's own predictions fill that column.

    Parameters
    ----------
    model_gender, model_age : estimator
        Classifiers for the two stages. When the matching ``params_*`` is
        ``None`` the estimator is fitted in place.
    params_gender, params_age : dict or None
        Parameter distributions for ``RandomizedSearchCV``; ``None`` skips
        the search for that stage.
    X_reduced : array-like
        Feature matrix, converted with ``np.array``.
    df : pandas.DataFrame
        Must contain the integer-coded ``'gender'`` and ``'age'`` columns.
    proba : bool, default False
        If True, the extra column is the class-1 probability of the gender
        model rather than its hard prediction.
    verbose_level : int, default 2
        Verbosity passed to the randomized searches.
    mode : str, default 'both'
        ``'gender'`` stops after the gender stage; any other value runs both.
    search_random_state : int or None, default None
        Seed for the randomized searches. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(gender_classifier, gender_best_params)`` when ``mode == 'gender'``,
        otherwise ``(gender_classifier, age_classifier, gender_best_params,
        age_best_params, total_acc)`` where ``total_acc`` is the test accuracy
        on the combined ``gender + 2 * age`` label.
    """
    from sklearn.base import clone

    X_reduced = np.array(X_reduced)
    # Encode both targets in one label so a single split keeps them aligned.
    y = np.array(df['gender'] + 2 * df['age'])
    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
    y_train_gender, y_train_age = y_train % 2, y_train // 2
    y_test_gender, y_test_age = y_test % 2, y_test // 2

    # ---- stage 1: gender -------------------------------------------------
    gender_classifier, gender_best_params, gender_best_score = _fit_or_search(
        model_gender, params_gender, X_train, y_train_gender, verbose_level, search_random_state
    )
    if(mode == 'gender'):
        gender_pred_test = gender_classifier.predict(X_test)
        gender_acc = accuracy_score(y_test_gender, gender_pred_test)
        print("Test set accuracy", gender_acc)
        print("Grid Accuracy", gender_best_score)
        return gender_classifier, gender_best_params

    # Out-of-fold predictions, so the age model never sees a gender
    # prediction made by a model trained on that same row.
    cv_gender_pred = cross_val_predict(
        gender_classifier,
        X_train,
        y_train_gender,
        cv=5,
        method='predict_proba' if proba else 'predict'
    )
    predicted_gender = (cv_gender_pred[:, 1] if proba else cv_gender_pred).reshape(-1, 1)
    X_train_aug = np.concatenate([X_train, predicted_gender], axis=1)

    # ---- stage 2: age ----------------------------------------------------
    if params_age is None and model_age is gender_classifier:
        # The same instance was passed for both stages without a search.
        # Fitting it on the age task would overwrite the gender model, so the
        # age stage gets its own unfitted copy.
        model_age = clone(model_age)
    age_classifier, age_best_params, _ = _fit_or_search(
        model_age, params_age, X_train_aug, y_train_age, verbose_level, search_random_state
    )

    # ---- evaluation on the held-out split ---------------------------------
    gender_pred_test = gender_classifier.predict(X_test)
    gender_acc = accuracy_score(y_test_gender, gender_pred_test)
    print("Gender Accuracy:", gender_acc)
    if proba:
        gender_feature_test = gender_classifier.predict_proba(X_test)[:, 1]
    else:
        gender_feature_test = gender_pred_test
    X_test_aug = np.concatenate([X_test, gender_feature_test.reshape(-1, 1)], axis=1)
    age_pred_test = age_classifier.predict(X_test_aug)
    age_acc = accuracy_score(y_test_age, age_pred_test)
    print("Age Accuracy:", age_acc)

    total_acc = accuracy_score(y_test, gender_pred_test + 2 * age_pred_test)
    print("Total Accuracy:", total_acc)

    return gender_classifier, age_classifier, gender_best_params, age_best_params, total_acc

In [8]:
def predict_gender_age(model_gender, model_age, X, proba):
    """Predict the combined ``gender + 2 * age`` label for every row of ``X``.

    The gender model runs first. Its output is appended to ``X`` as one extra
    column and the widened matrix is handed to the age model.

    Parameters
    ----------
    model_gender, model_age : fitted estimators
        ``model_gender`` needs ``predict`` (and ``predict_proba`` when
        ``proba`` is true); ``model_age`` needs ``predict``.
    X : array-like, 2-D
        Feature matrix without the gender column.
    proba : bool
        If true, the extra column is column 1 of ``predict_proba``;
        otherwise it is the hard gender prediction.

    Returns
    -------
    numpy.ndarray
        Flat array of combined labels.
    """
    gender_col = model_gender.predict(X).reshape(-1, 1)
    extra_col = model_gender.predict_proba(X)[:, 1].reshape(-1, 1) if proba else gender_col
    augmented = np.concatenate([X, extra_col], axis=1)
    age_col = model_age.predict(augmented).reshape(-1, 1)
    combined = gender_col + 2 * age_col
    return combined.flatten()

Sequential accuracy when the predicted gender label (not the probability) is passed to the age model:
gender = 0.88, age = 0.81, both = 0.74

Sequential accuracy when the predicted gender probability is passed to the age model: both = 0.73, age worse than above (0.73)


After changing the grid search to consider more features =>

In [9]:
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from Linear_Model_Tree import LinearModelTree
from sklearn.naive_bayes import GaussianNB

# Seed the estimator so the tuned models and the reported accuracies can be
# reproduced after a kernel restart.
et = ExtraTreesClassifier(random_state=42)
# Search space for the randomized search, keyed by model name.
params_dict = {
    'et': {
        'n_estimators': [500,1000,1500,2000,2500],
        'max_depth': [None, 5, 10, 20,30,50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2',15,20,25,30,35,40],
    }
}
# The same template is passed for both stages; each randomized search clones
# it, so the gender and age models are independent fitted objects.
model_g, model_a, params_g, params_a, total = grid_search_sequential(et, et, params_dict['et'], params_dict['et'], X, df, proba = True, verbose_level = 2, mode = 'both')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=50, max_features=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=  25.8s
[CV] END max_depth=30, max_features=40, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=  37.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=1500; total time=  16.7s
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  17.9s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=  13.3s
[CV] END max_depth=5, max_features=25, min_samples_leaf=2, min_samples_split=2, n_estimators=1500; total time=  15.3s
[CV] END max_depth=10, max_features=40, min_samples_leaf=2, min_samples_split=2, n_estimators=1500; total time=  40.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators



[CV] END max_depth=50, max_features=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=  25.7s
[CV] END max_depth=20, max_features=40, min_samples_leaf=1, min_samples_split=2, n_estimators=1500; total time= 1.0min
[CV] END max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  17.9s
[CV] END max_depth=5, max_features=25, min_samples_leaf=2, min_samples_split=2, n_estimators=1500; total time=  15.8s
[CV] END max_depth=10, max_features=35, min_samples_leaf=4, min_samples_split=10, n_estimators=2500; total time=  58.0s
[CV] END max_depth=20, max_features=20, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=  12.3s
[CV] END max_depth=None, max_features=30, min_samples_leaf=4, min_samples_split=5, n_estimators=2500; total time= 1.4min
[CV] END max_depth=50, max_features=25, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=  26.0s
[CV] END max_depth=20, max_features=40, min_s

In [10]:
# Best hyper-parameters of the gender and age models, followed by the
# combined (gender + age) test accuracy returned by grid_search_sequential.
print(params_g, params_a, total)

{'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 40, 'max_depth': 20} {'n_estimators': 2000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 35, 'max_depth': 50} 0.8320444659255679


In [11]:
# Baseline: a single classifier predicting the combined gender + 2 * age
# label directly, on the same 80/20 split used by grid_search_sequential.
X_reduced = np.array(X)
y = np.array(df['gender'] + 2 * df['age'])
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
# random_state fixes which candidates are sampled, so a re-run compares the
# same ten configurations.
combined_grid = RandomizedSearchCV(et, param_distributions=params_dict['et'], cv=5, n_jobs=-1, verbose=2, random_state=42)
combined_grid.fit(X_train, y_train)
best_combined_model = combined_grid.best_estimator_
best_combined_params = combined_grid.best_params_
best_combined_score = combined_grid.best_score_
print("Best parameters:", best_combined_params)
print("Best score:", best_combined_score)
print("Test set accuracy:", best_combined_model.score(X_test, y_test))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=50, max_features=35, min_samples_leaf=2, min_samples_split=5, n_estimators=2000; total time= 1.4min
[CV] END max_depth=5, max_features=35, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   6.2s
[CV] END max_depth=5, max_features=35, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   6.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=  34.4s
[CV] END max_depth=50, max_features=30, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time= 1.4min
[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1500; total time=  21.6s




[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=  12.4s
[CV] END max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=  28.4s
[CV] END max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=  28.2s
[CV] END max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=  28.4s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=  34.4s
[CV] END max_depth=None, max_features=20, min_samples_leaf=4, min_samples_split=5, n_estimators=2000; total time=  53.2s
[CV] END max_depth=None, max_features=20, min_samples_leaf=4, min_samples_split=5, n_estimators=2000; total time=  53.4s
[CV] END max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=1500; total time=  21.8s
[CV] END max_depth=None, max_f

In [None]:
# Tune alternative age models on top of the already tuned gender model.
KNN = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [1, 3, 5, 7, 9, 11,13,15,17,19,21],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}
_, model_a_knn, _, params_a_knn, total_knn = grid_search_sequential(model_g, KNN, None, knn_params, X, df, proba = False, verbose_level = 2, mode = 'both')

# Seeded so the tuned forest can be reproduced.
rf = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [10, 20, 50, 100,300,500,700,1000, 1500],
    'max_depth': [None, 5, 10, 20,30,50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2',15,20,25,30,35,40],
}
_, model_a_rf, _, params_a_rf, total_rf = grid_search_sequential(model_g, rf, None, rf_params, X, df, proba = False, verbose_level = 2, mode = 'both')


print("Best parameters for KNN:", params_a_knn)
print("Best parameters for RF:", params_a_rf)
print("Best parameters for ET:", params_a)
print("Best score for KNN:", total_knn)
print("Best score for RF:", total_rf)
print("Best score for ET:", total)
# The age models expect the gender column appended to the features and
# predict age only, so they cannot be scored directly against X_test and the
# combined y_test. predict_gender_age runs the full gender -> age pipeline.
# The proba flag must match how each age model was trained: KNN and RF used
# hard gender labels, ET used the gender probability.
print("Test set accuracy for KNN:", accuracy_score(y_test, predict_gender_age(model_g, model_a_knn, X_test, False)))
print("Test set accuracy for RF:", accuracy_score(y_test, predict_gender_age(model_g, model_a_rf, X_test, False)))
print("Test set accuracy for ET:", accuracy_score(y_test, predict_gender_age(model_g, model_a, X_test, True)))


In [None]:
# probability=True exposes predict_proba on the tuned SVC.
svm = SVC(probability=True)
svm_params = {
    'C': [0.1, 1, 10, 100,1000, 10000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
}
_, model_a_svm, _, params_a_svm, total_svm = grid_search_sequential(model_g, svm, None, svm_params, X, df, proba = False, verbose_level = 2, mode = 'both')
print("Best parameters for SVM:", params_a_svm)
print("Best score for SVM:", total_svm)
# model_a_svm predicts age from features plus the gender column, so it is
# evaluated through the full gender -> age pipeline against the combined
# label instead of being scored directly on X_test / y_test.
print("Test set accuracy for SVM:", accuracy_score(y_test, predict_gender_age(model_g, model_a_svm, X_test, False)))


In [None]:
from sklearn.ensemble import StackingClassifier
# Candidate hyper-parameters for a logistic-regression meta-learner.
# NOTE(review): this grid is not passed to any search in this cell; confirm
# whether it is still needed.
lr_params = dict(
    penalty=['l1', 'l2', 'elasticnet', 'none'],
    C=[0.01, 0.1, 1, 10, 100],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

# Tuned age models used as the first level of the stack.
# NOTE(review): these were tuned with an extra gender column, but the stack
# below is fitted on X_train without it; confirm this is intended.
base_estimators = list(zip(
    ('knn', 'rf', 'et', 'svc'),
    (model_a_knn, model_a_rf, model_a, model_a_svm),
))

meta_classifier = LogisticRegression(max_iter=1000)
stacking_clf = StackingClassifier(
    estimators=base_estimators, final_estimator=meta_classifier,
    cv=5, stack_method='auto', n_jobs=-1,
)

# y_train carries gender + 2 * age, so integer division by 2 recovers age.
stacking_clf.fit(X_train, y_train // 2)

In [None]:
# Age-only accuracy of the stacked model (y_test // 2 recovers the age label).
stacking_pred = stacking_clf.predict(X_test)
stacking_acc = accuracy_score(y_test // 2, stacking_pred)
print("Stacking Classifier Age Accuracy:", stacking_acc)
# Reuse the age predictions computed above instead of running the whole stack
# a second time, then combine them with the gender model's predictions.
y_pred_test = stacking_pred * 2 + model_g.predict(X_test)
print("Stacking Classifier Total Accuracy:", accuracy_score(y_test, y_pred_test))