In [1]:
import pandas as pd

In [2]:
# Load the three input tables from the ml-1m data directory.
# users.csv and movies.csv are read with pandas' default CSV settings.
users = pd.read_csv("../data/ml-1m/users.csv")
movies = pd.read_csv("../data/ml-1m/movies.csv")

# ratings.dat is "::"-delimited and has no header row, so the column names are
# supplied here. A multi-character separator requires the Python parsing engine.
ratings = pd.read_csv(
    "../data/ml-1m/ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=["user_id", "movie_id", "rating", "timestamp"],
)

In [3]:
# One row per rating, enriched first with the rated movie's columns and then
# with the rating user's columns. Both joins are inner joins, so a rating whose
# movie_id or user_id has no match in the lookup table is dropped.
df = (
    ratings
    .merge(movies, how="inner", on="movie_id")
    .merge(users, how="inner", on="user_id")
)
df

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,year,gender,age,occupation
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,1970,F,Under 18,K-12 student
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,1990,F,Under 18,K-12 student
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,1960,F,Under 18,K-12 student
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,2000,F,Under 18,K-12 student
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,1990,F,Under 18,K-12 student
...,...,...,...,...,...,...,...,...,...,...
1000204,4211,3791,2,965319075,Footloose (1984),Drama,1980,M,45-49,customer service
1000205,4211,3806,3,965319138,MacKenna's Gold (1969),Western,1960,M,45-49,customer service
1000206,4211,3840,4,965319197,Pumpkinhead (1988),Horror,1980,M,45-49,customer service
1000207,4211,3766,2,965319138,Missing in Action (1984),Action|War,1980,M,45-49,customer service


In [4]:
# Count, for each gender, how many ratings fall into each genre.
#
# Result shape: {gender: {genre: number_of_ratings}}. A movie tagged with
# several genres ("A|B|C") contributes one count to each of them.
#
# This is done with vectorized pandas operations rather than df.iterrows(),
# which is extremely slow on a frame with on the order of a million rows.
# Rows with a missing gender or genre are skipped by the groupby instead of
# raising on `.split`.
#
# To restrict the counts to well-rated movies, filter `df` before this step
# (e.g. keep only rows with rating >= 4).
_genre_pairs = pd.DataFrame(
    {
        "gender": df["gender"],
        # "A|B|C" -> ["A", "B", "C"]; explode() then yields one row per genre.
        "genre": df["genre"].str.split("|"),
    }
).explode("genre")

# sort=False keeps groups in order of first appearance, so the nested dict is
# filled in the same order a row-by-row scan of `df` would produce.
_pair_counts = _genre_pairs.groupby(["gender", "genre"], sort=False).size()

genre_counts = {}
for (_gender, _genre), _count in _pair_counts.items():
    # Store plain Python ints so the structure is an ordinary dict of dicts.
    genre_counts.setdefault(_gender, {})[_genre] = int(_count)

In [5]:
# Turn the nested counts into a table (rows: genres, columns: genders) and
# rescale every gender column by its own total, so each column holds that
# gender's share of genre tags and sums to 1.
raw_genre_counts = pd.DataFrame(genre_counts)
genre_df = raw_genre_counts.div(raw_genre_counts.sum(), axis="columns")
genre_df

Unnamed: 0,F,M
Drama,0.194002,0.160649
Animation,0.024155,0.01947
Children's,0.042134,0.031875
Musical,0.026693,0.017563
Romance,0.099414,0.060923
Comedy,0.190283,0.163113
Action,0.090229,0.132721
Adventure,0.054023,0.06681
Fantasy,0.017231,0.017284
Sci-Fi,0.054157,0.081393


In [27]:
# Relative deviation of each gender's genre share from the two-gender midpoint.
#   Ave : unweighted mean of the F and M shares for the genre
#   M_p : (M - Ave) / Ave  -> positive when the genre is over-represented for M
#   F_p : (F - Ave) / Ave  -> positive when the genre is over-represented for F
# Because Ave is the midpoint of exactly two values, F_p is always -M_p.
female_share = genre_df["F"]
male_share = genre_df["M"]
midpoint = (male_share + female_share) / 2

genre_df["Ave"] = midpoint
genre_df["M_p"] = (male_share - midpoint) / midpoint
genre_df["F_p"] = (female_share - midpoint) / midpoint
genre_df

Unnamed: 0,F,M,Ave,M_p,F_p
Drama,0.194002,0.160649,0.177326,-0.094046,0.094046
Animation,0.024155,0.01947,0.021813,-0.107392,0.107392
Children's,0.042134,0.031875,0.037004,-0.138611,0.138611
Musical,0.026693,0.017563,0.022128,-0.206307,0.206307
Romance,0.099414,0.060923,0.080168,-0.240059,0.240059
Comedy,0.190283,0.163113,0.176698,-0.07688,0.07688
Action,0.090229,0.132721,0.111475,0.190593,-0.190593
Adventure,0.054023,0.06681,0.060416,0.10583,-0.10583
Fantasy,0.017231,0.017284,0.017258,0.001521,-0.001521
Sci-Fi,0.054157,0.081393,0.067775,0.200933,-0.200933


In [28]:
# Genres ranked from most over-represented among F ratings to least.
genre_df.sort_values(by="F_p", ascending=False)

Unnamed: 0,F,M,Ave,M_p,F_p
Romance,0.099414,0.060923,0.080168,-0.240059,0.240059
Musical,0.026693,0.017563,0.022128,-0.206307,0.206307
Children's,0.042134,0.031875,0.037004,-0.138611,0.138611
Animation,0.024155,0.01947,0.021813,-0.107392,0.107392
Drama,0.194002,0.160649,0.177326,-0.094046,0.094046
Comedy,0.190283,0.163113,0.176698,-0.07688,0.07688
Mystery,0.019718,0.018925,0.019321,-0.020518,0.020518
Documentary,0.003834,0.003741,0.003788,-0.012353,0.012353
Fantasy,0.017231,0.017284,0.017258,0.001521,-0.001521
Film-Noir,0.008305,0.00881,0.008557,0.029459,-0.029459


In [8]:
# Genres ranked from most over-represented among M ratings to least.
genre_df.sort_values(by="M_p", ascending=False)

Unnamed: 0,F,M,Ave,M_p,F_p
Western,0.006872,0.010782,0.008827,0.221431,-0.221431
Sci-Fi,0.054157,0.081393,0.067775,0.200933,-0.200933
Action,0.090229,0.132721,0.111475,0.190593,-0.190593
Horror,0.028927,0.038694,0.03381,0.144446,-0.144446
Adventure,0.054023,0.06681,0.060416,0.10583,-0.10583
War,0.027855,0.034109,0.030982,0.100927,-0.100927
Crime,0.032498,0.039539,0.036018,0.097736,-0.097736
Thriller,0.07967,0.093599,0.086634,0.080388,-0.080388
Film-Noir,0.008305,0.00881,0.008557,0.029459,-0.029459
Fantasy,0.017231,0.017284,0.017258,0.001521,-0.001521


In [18]:
# Ordering derived from the data: genres from highest to lowest F_p.
f_c = genre_df.sort_values("F_p", ascending=False).index.to_list()

# Hand-entered reference ordering of the same genres to compare against.
# NOTE(review): the origin of this ordering is not shown in the notebook;
# document where it comes from.
f_v = [
    "Crime",
    "Film-Noir",
    "Western",
    "Horror",
    "Musical",
    "Action",
    "Documentary",
    "Drama",
    "Thriller",
    "Comedy",
    "Sci-Fi",
    "Mystery",
    "Children's",
    "Romance",
    "War",
    "Adventure",
    "Fantasy",
    "Animation",
]

# Both orderings must contain exactly the same genres, each exactly once.
# Otherwise the two rank lists below would have different lengths or be
# misaligned, and the rank correlation computed from them would be meaningless.
if len(set(f_v)) != len(f_v):
    raise ValueError("f_v contains duplicate genres")
if len(f_c) != len(f_v) or set(f_c) != set(f_v):
    raise ValueError(
        "f_c and f_v must list the same genres; differing entries: "
        f"{sorted(set(f_c) ^ set(f_v))}"
    )

# p_c[i] is the position in f_v of the genre that sits at position i in f_c.
position_in_f_v = {genre: position for position, genre in enumerate(f_v)}
p_c = [position_in_f_v[genre] for genre in f_c]

# p_v[i] == i: the position of that same genre in f_c. Pairing p_c with p_v
# therefore gives, genre by genre, (rank in f_v, rank in f_c).
p_v = list(range(len(f_v)))

In [20]:
from scipy.stats import spearmanr

# Spearman rank correlation between the two genre orderings. p_c and p_v hold,
# position by position, the two ranks of the same genre. The displayed result
# carries the correlation coefficient and its p-value.
spearmanr(a=p_c, b=p_v)

SpearmanrResult(correlation=-0.3085655314757482, pvalue=0.2128287059586837)