In [None]:
import ast
from itertools import combinations
from multiprocessing import Pool, cpu_count

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

In [None]:
def parse_genre_list(raw):
    """Convert one raw ``genre`` cell from the CSV into a list of genre names.

    ``read_csv`` hands each cell to a converter as text. The later cells
    iterate over every row's ``genre`` value label by label
    (``MultiLabelBinarizer``, ``itertools.combinations``); iterating a plain
    string there would yield single characters instead of genre names.

    NOTE(review): assumes cells hold a Python-style list literal such as
    "['Comedy', 'Sports']". Text that is not a valid literal is split on
    commas instead. Confirm against the actual file.

    Parameters
    ----------
    raw : str
        Raw cell text as read from the file.

    Returns
    -------
    list of str
        Stripped, non-empty genre names; ``[]`` for a blank cell.
    """
    text = raw.strip() if isinstance(raw, str) else ''
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal (e.g. "Comedy, Sports"): treat as comma-separated.
        parsed = text.split(',')
    if not isinstance(parsed, (list, tuple, set)):
        # A bare scalar literal counts as a single genre.
        parsed = [parsed]
    names = (str(item).strip() for item in parsed)
    return [name for name in names if name]


# Parse 'genre' while loading so every later cell sees real lists of names.
animes_df = pd.read_csv("./data/animes.csv", converters={'genre': parse_genre_list})
animes_df.info()

In [None]:
# Several rows can share one 'uid'; retain only the first occurrence of each.
animes_df = animes_df.drop_duplicates(
    subset=['uid'],
    keep='first',
)
# Preview the de-duplicated frame.
animes_df.head()

In [None]:
# One-hot encode genre membership: one 0/1 indicator column per label found in 'genre'.
# NOTE(review): MultiLabelBinarizer iterates each cell as a collection of labels, so this
# presumes every 'genre' value is a list of names; a raw string would be split into
# single characters. Confirm the column is parsed before this cell runs.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(animes_df['genre'])
genre_df = pd.DataFrame(
    genre_matrix,
    index=animes_df.index,
    columns=mlb.classes_,
)

# Add the target column so it takes part in the pairwise correlation matrix.
genre_df['popularity'] = animes_df['popularity']
genre_corr = genre_df.corr()

# Correlation of each genre with popularity, ordered by absolute strength.
genre_corr_popularity = (
    genre_corr['popularity']
    .drop('popularity')
    .sort_values(key=abs, ascending=False)
)

print("Top Genres Correlated with Popularity:\n", genre_corr_popularity.head(20))

def plot_correlations(correlations, title):
    """Show a horizontal bar chart of correlation values.

    Parameters
    ----------
    correlations : object with a pandas-style ``.plot(kind='barh')`` method
        Values to draw, one bar per entry (called with a Series in this notebook).
    title : str
        Title placed on the chart.
    """
    # Open a 16x8 figure first; the bar plot is then labelled through the
    # axes object that ``.plot`` hands back.
    plt.figure(figsize=(16, 8))
    bar_axes = correlations.plot(kind='barh')
    bar_axes.set(title=title, xlabel='Correlation', ylabel='Genres')
    plt.show()

# Chart the 20 genres at the head of the magnitude-sorted correlation series.
plot_correlations(
    genre_corr_popularity.head(20),
    title='Top 20 Genres Correlated with Popularity',
)

In [None]:
def generate_combinations(genres, n):
    """Return every ``n``-element combination of ``genres`` as a list of tuples.

    Combinations follow the order produced by ``itertools.combinations``;
    the list is empty when ``n`` exceeds the number of genres.
    """
    return [combo for combo in combinations(genres, n)]

def flatten_combinations(combinations_list):
    """Turn each combination into one label by joining its members with ' & '.

    Returns a list with one string per combination, in the input order.
    """
    separator = ' & '
    return list(map(separator.join, combinations_list))

def process_genre_combinations(df_chunk, n):
    """Build the ' & '-joined ``n``-genre combination labels for each row of a chunk.

    Parameters
    ----------
    df_chunk : pandas.DataFrame
        Rows to process; must contain a ``genre`` column whose values are
        iterables of genre names.
    n : int
        Number of genres per combination.

    Returns
    -------
    pandas.Series
        One list of label strings per row, indexed like ``df_chunk``. An
        empty chunk yields an empty Series.
    """
    def labels_for(genres):
        return flatten_combinations(generate_combinations(genres, n))

    # Work on the 'genre' column directly instead of DataFrame.apply(axis=1):
    # row-wise apply returns an empty DataFrame (not a Series) for an empty
    # chunk, and it builds a full row Series just to read one field.
    # rename(None) keeps the result unnamed, as the row-wise version was.
    return df_chunk['genre'].apply(labels_for).rename(None)

def parallelize_dataframe(df, func, n):
    """Apply ``func(chunk, n)`` to row chunks of ``df`` in worker processes.

    The frame is cut into at most ``cpu_count()`` contiguous, non-empty row
    chunks. Each chunk is submitted to a ``multiprocessing.Pool`` and the
    per-row results are collected in submission order, so they line up with
    ``df.index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Called as ``func(chunk, n)``. Iterating its return value must yield
        one result per row of ``chunk``, in row order. It is sent to worker
        processes, so it has to be picklable.
    n : int
        Forwarded unchanged as the second positional argument of ``func``.

    Returns
    -------
    pandas.Series
        One entry per row of ``df``, indexed by ``df.index``.

    Notes
    -----
    NOTE(review): under the "spawn" start method, functions defined in a
    notebook cell may not be importable by the workers. Confirm on the
    target platform.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to distribute; skip starting a pool entirely.
        return pd.Series([], index=df.index, dtype=object)

    # Never create more chunks than rows, so no worker receives an empty frame.
    n_chunks = min(cpu_count(), n_rows)
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame triggers a deprecation warning in recent pandas releases.
    row_groups = np.array_split(np.arange(n_rows), n_chunks)
    chunks = [df.iloc[rows] for rows in row_groups]

    results = []
    with Pool(n_chunks) as pool:
        # apply_async unpacks (chunk, n) into two positional arguments.
        # imap_unordered handed func one tuple and returned chunks in
        # completion order, which misaligned results with df.index.
        pending = [pool.apply_async(func, (chunk, n)) for chunk in chunks]
        for job in tqdm(pending, total=len(pending)):
            results.extend(job.get())
    return pd.Series(results, index=df.index)

tqdm.pandas()

# Label every anime with all of its 2-genre and 3-genre combinations.
animes_df['genre_combinations_2'] = parallelize_dataframe(animes_df, process_genre_combinations, 2)
animes_df['genre_combinations_3'] = parallelize_dataframe(animes_df, process_genre_combinations, 3)

# Vocabulary of every combination label that occurs at least once.
# The comprehension visits each row's list once; Series.sum() would
# concatenate the lists pairwise (quadratic) and returns 0 on an empty Series.
all_combinations = {
    label
    for column in ('genre_combinations_2', 'genre_combinations_3')
    for row_labels in animes_df[column]
    for label in row_labels
}

# Both encodings share this vocabulary, so each matrix has exactly one
# column per entry of mlb.classes_.
mlb = MultiLabelBinarizer(classes=sorted(all_combinations))
genre_comb_matrix_2 = mlb.fit_transform(animes_df['genre_combinations_2'])
genre_comb_matrix_3 = mlb.transform(animes_df['genre_combinations_3'])

# Merge the two indicator matrices with an element-wise maximum (logical OR
# of 0/1 flags). Stacking them side by side doubled the column count, which
# no longer matched mlb.classes_ and made the DataFrame constructor fail.
genre_comb_matrix = np.maximum(genre_comb_matrix_2, genre_comb_matrix_3)

genre_comb_df = pd.DataFrame(genre_comb_matrix, columns=mlb.classes_, index=animes_df.index)

genre_comb_df['popularity'] = animes_df['popularity']

# NOTE(review): this builds the full pairwise matrix over every combination
# column although only the 'popularity' column is read below. If it proves
# slow, genre_comb_df.corrwith(animes_df['popularity']) may be a cheaper
# route; check that nothing else needs genre_comb_corr first.
genre_comb_corr = genre_comb_df.corr()

# Correlation of each combination with popularity, ordered by absolute strength.
genre_comb_corr_popularity = genre_comb_corr['popularity'].drop('popularity').sort_values(key=abs, ascending=False)

In [None]:
# Report the 20 combinations at the head of the magnitude-sorted correlation series.
print(
    "Top Genre Combinations Correlated with Popularity:\n",
    genre_comb_corr_popularity.head(20),
)

def plot_correlations(correlations, title):
    """Show a horizontal bar chart of correlation values for genre combinations.

    NOTE(review): this redefines ``plot_correlations`` from an earlier cell;
    the two versions differ only in figure size and y-axis label.

    Parameters
    ----------
    correlations : object with a pandas-style ``.plot(kind='barh')`` method
        Values to draw, one bar per entry (called with a Series in this notebook).
    title : str
        Title placed on the chart.
    """
    # A 10x8 figure is opened first; labels go through the axes returned by ``.plot``.
    plt.figure(figsize=(10, 8))
    bar_axes = correlations.plot(kind='barh')
    bar_axes.set_title(title)
    bar_axes.set_xlabel('Correlation')
    bar_axes.set_ylabel('Genre Combinations')
    plt.show()

# Chart the 20 genre combinations at the head of the magnitude-sorted correlation series.
plot_correlations(
    genre_comb_corr_popularity.head(20),
    title='Top 20 Genre Combinations Correlated with Popularity',
)