In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

In [2]:
files = ["fixed.csv"]
dfs = []
for file in files:
    df = pd.read_csv(file)
    dfs.append(df)
df = pd.concat(dfs, axis=0)

In [3]:
df['gender'].value_counts()

gender
male      2211
female     555
Name: count, dtype: int64

In [4]:
df['age'].value_counts()

age
twenties    2220
fifties      546
Name: count, dtype: int64

In [5]:
plt.figure(figsize=(12, 10))
df['gender'] = df['gender'].map({'male': 0, 'female': 1})
df['age'] = df['age'].map({'twenties': 0, 'fifties': 1})
print(df.shape)

(2766, 106)


<Figure size 1200x1000 with 0 Axes>

In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
def analyze_feature_correlations(df, threshold=0.5):

    numeric_df = df.select_dtypes(include=['number'])

    corr_matrix = numeric_df.corr()

    # Find high correlations
    high_corr = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr.append((corr_matrix.columns[i], corr_matrix.columns[j],
                                 corr_matrix.iloc[i, j]))
    return high_corr
correlated_features = analyze_feature_correlations(df, 0.8)
def features_to_keep(all_features, corr_features):
    features_set = set()
    to_keep = []

    for x, y, z in corr_features:
        if x not in features_set and y not in features_set:
            to_keep.append(x)
            features_set.add(x)
            features_set.add(y)

    for feature in all_features:
        if feature not in features_set:
            to_keep.append(feature)

    return to_keep
remaining_features = features_to_keep(df.select_dtypes(include=['number']).columns, correlated_features)
remaining_features.append('age')
remaining_features.append('gender')
remaining_features = list(set(remaining_features))
df = df[remaining_features]
X = df.drop('gender', axis=1).drop('age',axis=1)
print(X.shape, df.shape)

(2766, 98) (2766, 100)


In [7]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
def grid_search_sequential(model_gender, model_age,params_gender, params_age, X_reduced,df, proba = False, verbose_level = 2, mode = 'both'):
    X_reduced = np.array(X_reduced)
    y = np.array(df['gender'] + 2 * df['age'])
    X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
    y_train_gender = y_train % 2
    y_train_age = y_train // 2
    y_test_gender = y_test % 2
    y_test_age = y_test // 2
    if params_gender is not None:
        gender_grid = RandomizedSearchCV(model_gender, param_distributions=params_gender, cv=5, n_jobs=-1, verbose=verbose_level)
        gender_grid.fit(X_train, y_train_gender)
        gender_classifier = gender_grid.best_estimator_
        gender_best_params = gender_grid.best_params_
        gender_best_score = gender_grid.best_score_
    else:
        gender_classifier = model_gender
        model_gender.fit(X_train,y_train_gender)
        gender_best_score = None
        gender_best_params = None
    if(mode == 'gender'):
        gender_pred_test = gender_classifier.predict(X_test).reshape(-1, 1)
        gender_acc = accuracy_score(y_test_gender, gender_pred_test)
        print("Test set accuracy", gender_acc)
        print("Grid Accuracy", gender_best_score)
        return gender_classifier, gender_best_params
    ## cross prediction
    cv_gender_pred = cross_val_predict(
        gender_classifier,
        X_train,
        y_train_gender,
        cv=5,
        method='predict_proba' if proba else 'predict'
    )

    # Format prediction as new feature
    if proba:
        predicted_gender = cv_gender_pred[:, 1].reshape(-1, 1)
    else:
        predicted_gender = cv_gender_pred.reshape(-1, 1)
    ## concatenate gender
    X_train = np.concatenate([X_train, predicted_gender], axis=1)
    if params_age is not None:
        age_grid = RandomizedSearchCV(model_age, param_distributions=params_age, cv=5, n_jobs=-1, verbose=verbose_level)
        age_grid.fit(X_train, y_train_age)
        age_classifier = age_grid.best_estimator_
        age_best_params = age_grid.best_params_
    else:
        age_classifier = model_age
        model_age.fit(X_train,y_train_age)
        age_best_params = None

    ### accuracy
    gender_pred_test = gender_classifier.predict(X_test).reshape(-1, 1)
    gender_acc = accuracy_score(y_test_gender, gender_pred_test)
    print("Gender Accuracy:", gender_acc)
    if(proba):
        gender_proba_test = gender_classifier.predict_proba(X_test)[:,1].reshape(-1, 1)
        X_test = np.concatenate([X_test, gender_proba_test], axis=1)
    else:
        X_test = np.concatenate([X_test, gender_pred_test], axis=1)
    age_pred_test = age_classifier.predict(X_test).reshape(-1, 1)
    age_acc = accuracy_score(y_test_age, age_pred_test)
    print("Age Accuracy:", age_acc)

    gender_age = gender_pred_test + 2 * age_pred_test
    total_acc = accuracy_score(gender_age, y_test)

    print("Total Accuracy:", total_acc)

    return gender_classifier, age_classifier, gender_best_params, age_best_params, total_acc

In [8]:
def predict_gender_age(model_gender, model_age, X, proba):
    gender = model_gender.predict(X).reshape(-1, 1)
    if(proba):
        gender_proba = model_gender.predict_proba(X)[:,1].reshape(-1, 1)
        X = np.concatenate([X, gender_proba], axis=1)
    else:
        X = np.concatenate([X, gender], axis = 1)
    age = model_age.predict(X).reshape(-1, 1)
    return (gender + 2 * age).flatten()

Sequential Accuracy with the result gender (not proba)
Gender 0.88, age = 0.81, both 0.74

Sequential Accuracy with the result gender proba both 73, age worse than above 0.73


After CHANGING the grid search for more features =>

In [11]:
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from Linear_Model_Tree import LinearModelTree
from sklearn.naive_bayes import GaussianNB

rf = RandomForestClassifier()
et = ExtraTreesClassifier()
params_dict = {
    'rf': {
        'n_estimators': [10, 20, 50, 100,300,500,700,1000],
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2',15,20,25,30,35,40],
    },
    'et': {
        'n_estimators': [10, 20, 50, 100,300,500,700,1000],
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2',15,20,25,30,35,40],
    }
}
model, params = grid_search_sequential(rf, None, params_dict['rf'], None, X, df, proba = False, verbose_level = 2, mode = 'gender')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Test set accuracy 0.9512635379061372
Grid Accuracy 0.9471037659724422


In [12]:
print(params)

{'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 30, 'max_depth': 20}
