### Looking at the performance/predictions of linear regression on FAST ratings

In [13]:
# Input CSV locations (relative paths, resolved against the working directory).
PREDICTIONS_FILE = "predictions/linreg_predictions.csv"  # table of model predictions
TRUE_RATINGS_FILE = "landmarks/landmarks_100.csv"  # table providing the 'rating' column
TRUE_GENDERS_FILE = "landmarks/landmarks_100_binarygenderlabels.csv"  # table providing the 'gender' column

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from IPython.display import display
# Notebook-wide pandas display settings: show floats with 3 digits of
# precision, and raise the display limits to 999 rows / 999 columns and
# 100 characters per cell.
pd.set_option("display.precision", 3)
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", 100)

# Load the three input tables from CSV.
df_predictions = pd.read_csv(PREDICTIONS_FILE)
df_lm_fast = pd.read_csv(TRUE_RATINGS_FILE)
df_lm_gender = pd.read_csv(TRUE_GENDERS_FILE)

# Only a single column is needed from each of the two landmark tables.
df_ratings = df_lm_fast.loc[:, 'rating']
df_true_gender = df_lm_gender.loc[:, 'gender']

# Put predictions, true ratings and gender labels side by side. The concat
# aligns on the row index, which for freshly read CSVs means row position.
# NOTE(review): assumes all three files list the same images in the same
# order - confirm.
# Only the first 60 rows are kept; every later cell works on this subset.
df = pd.concat(
    [df_predictions, df_ratings, df_true_gender],
    axis='columns',
).iloc[:60]
display(df)

Unnamed: 0.1,Unnamed: 0,image_url,prediction,rating,gender
0,0.0,18_0_3_20170119151213151.jpg.chip,2.572,1.75,0
1,1.0,19_0_1_20170114025936564.jpg.chip,2.198,5.0,1
2,2.0,21_1_3_20170119154311378.jpg.chip,1.532,0.4,0
3,3.0,22_0_1_20170113193211629.jpg.chip,1.537,1.6,0
4,4.0,23_0_0_20170117144116074.jpg.chip,2.934,2.8,1
5,5.0,24_0_2_20170116165047009.jpg.chip,3.71,0.8,0
6,6.0,24_1_2_20170116173444326.jpg.chip,3.262,5.8,1
7,7.0,25_0_0_20170117191222353.jpg.chip,2.642,0.4,0
8,8.0,25_0_0_20170119171125583.jpg.chip,1.293,2.8,0
9,9.0,26_0_1_20170113154951523.jpg.chip,1.604,5.4,1


In [14]:
# Inspect the faces with the 10 highest and the 10 lowest *predicted* ratings,
# showing the true rating next to each prediction.

_report_columns = ['image_url', 'prediction', 'rating']

df_highest = df.nlargest(10, 'prediction')
df_smallest = df.nsmallest(10, 'prediction')

print('Highest rating predictions')
print(df_highest[_report_columns])

print('\nLowest rating predictions')
print(df_smallest[_report_columns])

Highest rating predictions
                            image_url  prediction  rating
23  28_1_1_20170113012030016.jpg.chip       5.005     4.8
46  38_1_1_20170112204552523.jpg.chip       4.447     6.0
15  26_1_2_20170116184248819.jpg.chip       4.160     1.2
16  26_1_3_20170119155612442.jpg.chip       4.059     5.6
36  34_1_1_20170117132708007.jpg.chip       4.014     0.2
25  29_1_1_20170117194236505.jpg.chip       3.975     4.8
58  50_0_1_20170113173828618.jpg.chip       3.916     5.4
10  26_0_1_20170116192106714.jpg.chip       3.858     5.2
32  32_1_0_20170117091929838.jpg.chip       3.733     1.0
49  39_0_3_20170119191918705.jpg.chip       3.727     1.6

Lowest rating predictions
                            image_url  prediction  rating
31  32_0_1_20170117140805502.jpg.chip       1.168    0.40
17  27_0_3_20170119152415944.jpg.chip       1.251    5.25
8   25_0_0_20170119171125583.jpg.chip       1.293    2.80
2   21_1_3_20170119154311378.jpg.chip       1.532    0.40
3   22_0_1_2017011

In [15]:
#Look at accuracy at predicting gender, using <3.5 as male and >=3.5 as female

# Decision threshold on the predicted rating: strictly below counts as a
# "male" prediction (gender == 0), at or above as "female" (gender == 1).
GENDER_THRESHOLD = 3.5

# Vectorized comparison instead of a df.loc[i, ...] loop over range(len(df)):
# the loop did label-based lookups and was only correct while df carried a
# default 0..n-1 index. These masks work for any index and count exactly the
# same rows (rows with a missing prediction or an unexpected gender value
# match neither branch, as before).
predicted_male = df['prediction'] < GENDER_THRESHOLD
predicted_female = df['prediction'] >= GENDER_THRESHOLD
is_correct = (predicted_male & (df['gender'] == 0)) | (predicted_female & (df['gender'] == 1))

_sum = int(is_correct.sum())
# Guard against an empty frame rather than raising ZeroDivisionError.
accuracy = _sum / len(df.index) if len(df.index) else float('nan')

print(accuracy)

#The accuracy for logistic regression was: 0.8969072164948454
#however, these 100 faces were also part of the train set

0.48333333333333334


In [16]:
# Mean squared error between true ratings and predictions, computed
# separately for each gender subset and then for the full frame.

df_m = df[df['gender'].eq(0)]  # males
df_f = df[df['gender'].eq(1)]  # females

mse_m = mean_squared_error(y_true=df_m['rating'], y_pred=df_m['prediction'])
mse_f = mean_squared_error(y_true=df_f['rating'], y_pred=df_f['prediction'])
mse_all = mean_squared_error(y_true=df['rating'], y_pred=df['prediction'])

print("male mean squared error: ", mse_m)
print("female mean squared error: ", mse_f)
print("both mean squared error: ", mse_all)

male mean squared error:  4.095624410029314
female mean squared error:  5.917036471718121
both mean squared error:  4.884902970094464


In [17]:
# Sample variance (ddof=1, the pandas default) of the predicted ratings for
# the male subset, the female subset, and all rows together.

m_var = df_m['prediction'].var(ddof=1)
f_var = df_f['prediction'].var(ddof=1)
all_var = df['prediction'].var(ddof=1)

print(
    'variance: ', all_var,
    '\nvariance males: ', m_var,
    '\nvariance females: ', f_var,
)

variance:  0.7748608766481054 
variance males:  0.73536432831395 
variance females:  0.8371892443314881
