# Fairness Analysis

In [2]:
import pandas as pd
import glob
import requests
import json
from collections import defaultdict

In [104]:
# Load the recommendation logs produced by both models (baseline and SVD) into
# one DataFrame that keeps only the `user_id` and `recommendations` columns.
baseline_file_paths = glob.glob('./logvolume_baseline/*.csv')
svd_file_paths = glob.glob('./logvolume_svd/*.csv')

# Defined once, outside any loop: previously it was created inside the baseline
# loop, so an empty baseline directory made the SVD loop fail with a NameError.
column_types = {'user_id': 'string', 'recommendations': 'string'}
log_columns = ['user_id', 'recommendations']

# Read every file first and concatenate once; calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration.
frames = [
    pd.read_csv(file, dtype=column_types)[log_columns]
    for file in baseline_file_paths + svd_file_paths
]
if frames:
    dfs = pd.concat(frames, axis=0).reset_index(drop=True)
else:
    # No log files found: keep an empty frame with the expected columns so
    # the following display cells still work.
    dfs = pd.DataFrame(columns=log_columns)

In [105]:
# Preview the first rows of the combined recommendation log.
dfs.head()

Unnamed: 0,user_id,recommendations
0,257870,"['the+shawshank+redemption+1994', 'the+godfath..."
1,179100,"['the+shawshank+redemption+1994', 'the+godfath..."
2,64290,"['the+shawshank+redemption+1994', 'the+godfath..."
3,914800,"['the+shawshank+redemption+1994', 'the+godfath..."
4,570960,"['the+shawshank+redemption+1994', 'the+godfath..."


In [112]:
# (rows, columns) of the combined recommendation log.
dfs.shape

(3932620, 2)

In [174]:
# Sample 1000 rows randomly (fixed random_state, so the sample is reproducible)
# from predictions made from April 17 to April 23.
sample_dfs = dfs.sample(n=1000, random_state=72).reset_index(drop=True) 

In [175]:
# For every sampled row, fetch the user's profile and the genres of each movie
# recommended to that user. Both containers are keyed by the row index of
# `sample_dfs`:
#   user_info[i]  -> JSON payload returned by the user endpoint
#   movie_info[i] -> list with one `genres` payload per recommended movie
userids = sample_dfs.user_id
user_info = {}
movie_info = defaultdict(list)
indices = list(sample_dfs.user_id.index)
for i in indices:
    userid = userids[i]
    user_api_url = f'http://128.2.204.215:8080/user/{userid}'
    user_response = requests.get(user_api_url)

    if user_response.status_code != 200:
        # Fixed: this branch used the undefined name `response`, so the first
        # failed lookup aborted the whole cell with a NameError.
        print('Error occurred: Status Code', user_response.status_code)
        print(userid)
        # Without a profile the recommendations cannot be attributed to a
        # gender group, so skip the movie lookups for this row.
        continue

    user_info[i] = json.loads(user_response.text)

    # `recommendations` is the string form of a Python list, e.g.
    # "['title+one+1994', 'title+two+1972']": strip the brackets, split on
    # ", ", then strip the quote character around each id.
    raw_recommendations = sample_dfs.recommendations[i]
    movieids = [quoted[1:-1] for quoted in raw_recommendations[1:-1].split(", ")]
    for movieid in movieids:
        movie_api_url = f'http://128.2.204.215:8080/movie/{movieid}'
        movie_response = requests.get(movie_api_url)

        # Movies whose lookup fails are left out of movie_info[i].
        if movie_response.status_code == 200:
            movie_response_data = json.loads(movie_response.text)
            movie_info[i].append(movie_response_data['genres'])

In [113]:
# Inspect the stored user payload for sample row 999 (user_info is keyed by
# the row index of sample_dfs).
user_info[999]

{'user_id': 733594, 'age': 28, 'occupation': 'scientist', 'gender': 'F'}

In [121]:
# Inspect the genre payloads collected for sample row 999 (one entry is
# appended per successfully fetched recommended movie).
movie_info[999]

[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}]

## We would like to see if `Romance` and `Action` movies are recommended equally to males and females

In [184]:
def movie_genre_ratio_by_gender(eval_genre, users=None, recommendations=None):
    """Share of female and male users who were recommended `eval_genre`.

    A user counts as "recommended the genre" when at least one of their
    recommended movies lists a genre whose ``'name'`` equals `eval_genre`;
    each user is counted at most once.

    Parameters
    ----------
    eval_genre : str
        Genre name to look for, compared against ``genre['name']``.
    users : dict, optional
        Mapping ``row index -> user payload`` (a dict with a ``'gender'``
        key). Defaults to the notebook-level ``user_info``.
    recommendations : dict, optional
        Mapping ``row index -> list of genre lists`` (one list of
        ``{'name': ...}`` dicts per recommended movie). Defaults to the
        notebook-level ``movie_info``.

    Returns
    -------
    tuple of float
        ``(female_ratio, male_ratio)``. A ratio is ``nan`` when its group has
        no users, instead of raising ``ZeroDivisionError``.
    """
    if users is None:
        users = user_info
    if recommendations is None:
        recommendations = movie_info

    group_sizes = {'F': 0, 'M': 0}
    group_hits = {'F': 0, 'M': 0}

    # Iterate over users that actually have a profile: rows whose user lookup
    # failed have no gender and cannot be assigned to either group.
    for i, user in users.items():
        gender = user.get('gender')
        # Only 'F' and 'M' form the two compared groups. Previously every
        # non-'F' user inflated the male denominator while only 'M' users
        # could contribute to the male numerator.
        if gender not in group_sizes:
            continue
        group_sizes[gender] += 1

        # .get avoids inserting empty entries when `recommendations` is a
        # defaultdict.
        user_recs = recommendations.get(i, [])
        if any(genre['name'] == eval_genre for rec in user_recs for genre in rec):
            group_hits[gender] += 1

    def _ratio(group):
        """Hit rate for one gender group, or nan when the group is empty."""
        size = group_sizes[group]
        return group_hits[group] / size if size else float('nan')

    return _ratio('F'), _ratio('M')

### Calculate: $P(Y' = 1| A= male)$ Where $Y'$ is Romance

In [185]:
# Element [1] of the returned tuple is the male ratio.
movie_genre_ratio_by_gender('Romance')[1]

0.9856801909307876

### Calculate: $P(Y' = 1| A= female)$ Where $Y'$ is Romance

In [186]:
# Element [0] of the returned tuple is the female ratio.
movie_genre_ratio_by_gender('Romance')[0]

1.0

$$\frac{P(Y' = 1| A= male)}{ P(Y' = 1| A= female)}  = 0.9857 \ge 0.8$$

Therefore, the probabilities that romance movies are recommended to males and to females satisfy the four-fifths rule, so we conclude that our model is fair in recommending romance movies across genders.

### Calculate: $P(Y' = 1| A= male)$ Where $Y'$ is Action

In [187]:
# Element [1] of the returned tuple is the male ratio.
movie_genre_ratio_by_gender('Action')[1]

1.0

### Calculate: $P(Y' = 1| A= female)$ Where $Y'$ is Action

In [188]:
# Element [0] of the returned tuple is the female ratio.
movie_genre_ratio_by_gender('Action')[0]

1.0

$$\frac{P(Y' = 1| A= male)}{ P(Y' = 1| A= female)}  = 1 \ge 0.8$$

Therefore, the probabilities that action movies are recommended to males and to females satisfy the four-fifths rule, so we conclude that our model is fair in recommending action movies across genders.