# CLUSTERING - armado de las temáticas de inmigración


Este Notebook se encarga del armado de las temáticas de inmigración, siguiendo estos pasos:

1. [Matrices $L2V$, distancia coseno, $TFIDF$](#matrices)
2. [Fast K-Medoids + Random Forest](#clusters-rf)
    - [Para distintos valores de $K$](#clusters-rf-k)
    - [10 repeticiones para $K$ = 500](#clusters-rf-10)
    - [Clúster más predictor](#clusters-rf-1)
    - [Comparar particiones y definir $T$ clústeres más predictores](#clusters-rf-t)
3. [Reagrupación vía clustering jerárquico](#jerarq)
4. [Etiquetas finales](#etiquetas)


Inputs:
- Vocabulario único (stoi.pkl)
- Dataset a nivel de película (master_subt_content_cleaned_lite.pkl) con las siguientes variables:
    - tconst (string): identificador de película
    - final_lemmas (lista de strings): lemas únicos de cada película
    - just_migra (int): variable indicadora de si una película es de inmigración o no
    
Outputs:
- Matrices $L2V$, de distancia coseno y $TFIDF$, y filmids (l2v.pkl, d.pkl, tfidf.pkl, filmids.pkl)
- Dataset con las temáticas de inmigración (final_clusters500.pkl)

In [None]:
# Importamos librerías 

## Módulos generales
from libraries import *

## Módulos con funciones creadas para este trabajo
## (requieren de haber importado previamente los módulos generales)
from limpieza_subt import *
from clustering import *

In [None]:
# Fill in with the local directories before running.
# datawd is the prefix of every data file read or written below;
# gitwd is not referenced in the visible cells.
gitwd = ""
datawd = ""

## Importar datos

In [None]:
# Vocabulary mapping (lemma -> integer index). It is unpickled, so only load trusted files.
with open(datawd + "/stoi.pkl", 'rb') as stoi_file:
    stoi = pickle.load(stoi_file)
UNK_IDX = stoi["<unk>"]

# Film-level dataset: keep rows with in_cleaned == 1 whose
# main + before2000 + just_migra sum is positive, and only the columns used below.
master = pd.read_pickle(datawd + "/master_subt_content_cleaned_lite.pkl")
in_cleaned = master[master.in_cleaned == 1].reset_index(drop = True)
flag_total = in_cleaned.main + in_cleaned.before2000 + in_cleaned.just_migra
byfilm = in_cleaned[flag_total > 0].reset_index(drop = True)
byfilm = byfilm[["tconst", "final_lemmas", "just_migra"]]
del master, in_cleaned, flag_total

In [None]:
# Display (rows, columns) of the filtered film-level dataset
byfilm.shape

<a id='matrices'></a>
## Matrices L2V, distancia coseno, TFIDF

### L2V

In [None]:
# Vocabulary table: one row per lemma with its integer index,
# excluding the special <unk> and <pad> tokens
special_tokens = ["<unk>", "<pad>"]
l2v = pd.DataFrame.from_dict(stoi, orient='index').reset_index()
l2v.columns = ["lemma", "stoi"]
is_special = l2v.lemma.isin(special_tokens)
l2v = l2v[~is_special].reset_index(drop = True)
l2v.shape

In [None]:
# Look up a GloVe vector for every lemma
# NOTE(review): get_word_vector is a project helper; how it treats out-of-vocabulary lemmas is not visible here
glove = api.load("glove-wiki-gigaword-300")  # pretrained GloVe vectors

l2v["WORD_VECTORS"] = [get_word_vector(lemma, glove) for lemma in tqdm(l2v.lemma)]

# Spread every vector over its own columns, named dim_1 ... dim_n
vector_columns = l2v.WORD_VECTORS.apply(pd.Series)
n_dims = vector_columns.shape[1]
vector_columns.columns = [f'dim_{j}' for j in range(1, n_dims + 1)]

# Keep only lemma, index and the dimension columns (the raw vector column is dropped)
l2v = pd.concat([l2v[["lemma", "stoi"]], vector_columns], axis = 1)
del vector_columns
del glove

# Save to disk and preview
l2v.to_pickle(datawd + "/l2v.pkl")
l2v.head(2)

### Distancia coseno

In [None]:
# Pairwise cosine similarity between all lemma vectors
dims = [col for col in l2v.columns if "dim_" in col]
lemma_vectors = l2v[dims]
cosine_sim_matrix = cosine_similarity(lemma_vectors, lemma_vectors)

# Min-max rescaling of the similarities using the matrix-wide extremes
min_value = cosine_sim_matrix.min()
max_value = cosine_sim_matrix.max()
value_range = max_value - min_value
cosine_sim_matrix = (cosine_sim_matrix - min_value) / value_range

# Similarity -> distance (PAM needs a distance matrix)
distance_matrix = 1 - cosine_sim_matrix
del cosine_sim_matrix, lemma_vectors

# Persist the distance matrix
with open(datawd + "/d.pkl", 'wb') as d_file:
    pickle.dump(distance_matrix, d_file)

### TFIDF

In [None]:
# (A) Build the lemma-count matrix ---------------------------------------------------------
# One row per (film, lemma) occurrence, then count occurrences per film and lemma
counts = byfilm[["tconst", "final_lemmas"]].explode("final_lemmas").reset_index(drop = True)
counts["aux"] = 1
# "sum" is passed as a string: handing the builtin `sum` to agg is deprecated in recent pandas
counts = counts.groupby(["tconst", "final_lemmas"], as_index = False).agg({"aux": "sum"})
counts["aux"] = counts.aux.astype("int")
counts.reset_index(inplace = True, drop = True)
counts["stoi"] = counts['final_lemmas'].map(stoi)

# Fail early with a readable message: a lemma missing from the vocabulary maps to NaN,
# which cannot be used as a column index of the sparse matrix built below
unmapped = counts.loc[counts.stoi.isna(), "final_lemmas"].unique()
if len(unmapped) > 0:
    raise ValueError(f"{len(unmapped)} lemmas are missing from stoi, e.g. {list(unmapped[:5])}")
print(counts.stoi.min(), counts.stoi.max())

# Row indices for the sparse count matrix (a dense one does not fit in memory):
# every film id (tconst) gets an integer code, stored in filmids
counts["tconst"] = counts["tconst"].astype('category')  # original note: 28519 categories
counts["filmid"] = counts["tconst"].cat.codes

filmids = counts[["tconst", "filmid"]].drop_duplicates().reset_index(drop = True)
filmids["tconst"] = filmids.tconst.astype("str")
filmids = filmids.merge(byfilm,
                        how = "left",
                        on = "tconst")

filmids.to_pickle(datawd + "/filmids.pkl")
del(byfilm)


# Sparse matrix of lemma counts: rows = films (filmid), columns = vocabulary index (stoi)
sparse_matrix = sp.coo_matrix( ( counts['aux'], (counts["filmid"], counts['stoi'])) )
del counts
sparse_matrix  # original note: 27018 columns because the maximum index is 27017 and index 0 (which is empty) also counts

In [None]:
# Stored entries in the first three vocabulary columns; per the original note,
# columns 0 and 1 are empty because they correspond to the UNK and PAD tokens
for col_idx in (0, 1, 2):
    column = sparse_matrix.getcol(col_idx)
    print(column.getnnz())

In [None]:
# (B) Build the TFIDF matrix from the sparse count matrix ------------------------------
tfidf = TfidfTransformer().fit_transform(sparse_matrix)
del sparse_matrix

# Persist the TFIDF matrix
with open(datawd + "/tfidf.pkl", 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

<a id='clusters-rf'></a>
## Fast K-Medoids + Random Forest 

In [None]:
# Reload the artifacts saved above: l2v, distance_matrix, filmids, tfidf
l2v = pd.read_pickle(datawd + "/l2v.pkl")
filmids = pd.read_pickle(datawd + "/filmids.pkl")

with open(datawd + "/d.pkl", 'rb') as d_file:
    distance_matrix = pickle.load(d_file)

with open(datawd + "/tfidf.pkl", 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

<a id='clusters-rf-k'></a>
### 1. Para distintos K

In [None]:
# Run clustering + random forest for several K and collect one metric per K
silhouettes, roc_aucs, n_lemmas, n_important = [], [], [], []

gc.collect()

for k in tqdm([50, 250, 500, 1000]):
    seed(42)
    model_k = Clusters_RF(l2v, distance_matrix, filmids, tfidf,
                          k, rseed = 42)
    # Pipeline steps of the project class, in the order required by the original cell
    model_k.get_clusters()
    model_k.describe_clusters()
    model_k.get_silhouette()
    model_k.get_f2c()
    model_k.rf()

    # Clusters with strictly positive importance
    important_count = np.sum(model_k.feature_importances.Importance > 0)
    print(f"Clusters con importancia mayor a 0: {important_count}")

    n_important.append(important_count)
    silhouettes.append(model_k.silhouette)
    roc_aucs.append(model_k.roc_auc)
    n_lemmas.append(np.mean(model_k.clusters.n_lemmas))

        

In [None]:
# Show the per-K metrics collected above, in the order K = 50, 250, 500, 1000:
# silhouette, ROC AUC, mean lemmas per cluster, clusters with importance > 0
print(silhouettes,
      roc_aucs,
      n_lemmas,
     n_important)

<a id='clusters-rf-10'></a>
### 2. Para k = 500, 10 repeticiones con distinta semilla para los clusters

In [None]:
# For K = 500, repeat clustering + random forest 10 times, changing the seed each time,
# and keep the clusters with positive importance from every repetition.
import os

k = 500
rseed = 42
n_reps = 10

# Create the output folder before the long loop so the final save cannot fail on a missing directory
out_dir = datawd + f"/clusters{k}"
os.makedirs(out_dir, exist_ok = True)

silhouettes = []
roc_aucs = []
n_important_clusters = []
n_lemmas = []
importances_per_run = []

for i in tqdm(range(n_reps)):
    seed(rseed)
    a = Clusters_RF(l2v, distance_matrix, filmids, tfidf,
                    k, rseed)
    a.get_clusters()
    a.describe_clusters()
    a.get_silhouette()
    a.get_f2c()
    a.rf()

    silhouettes.append(a.silhouette)
    roc_aucs.append(a.roc_auc)
    n_lemmas.append(np.mean(a.clusters.n_lemmas))

    cluster_importances = a.feature_importances.merge(a.clusters,
                                                      how = "left",
                                                      on = "cluster")
    cluster_importances["t"] = i  # repetition (partition) id

    # Keep just the important clusters
    is_important = cluster_importances.Importance > 0
    n_important_clusters.append(np.sum(is_important))
    importances_per_run.append(cluster_importances[is_important].reset_index(drop = True))

    rseed += 1  # a different seed for the next repetition

# A single concat at the end instead of growing a frame from an empty one inside the loop
cluster_importances_k = pd.concat(importances_per_run, axis = 0)
cluster_importances_k.to_pickle(out_dir + f"/cluster_importances_{k}.pkl")

print("K =", k)
# Per-repetition values and their means (the "mean ..." labels used to print the raw lists)
print("silhouettes:", silhouettes)
print("roc_aucs:", roc_aucs)
print("mean silhouettes:", np.mean(silhouettes))
print("mean roc_aucs:", np.mean(roc_aucs))
print("mean n_lemmas:", np.mean(n_lemmas))
print("n_important_clusters:", n_important_clusters)

<a id='clusters-rf-1'></a>
### 3. Clúster más predictor de inmigración

In [None]:
# Load the important clusters of all repetitions saved for K = 500
k = 500
clusters = pd.read_pickle(datawd + f"/clusters{k}/cluster_importances_{k}.pkl")

In [None]:
# First `ntop` rows of every repetition
# NOTE(review): assumes rows are already ordered by importance within each repetition — confirm
ntop = 1
first_rows = clusters.groupby("t").head(ntop)
top = first_rows.sort_values(["t", "Cumulative"]).reset_index(drop = True)

# Importance rank inside each repetition (1 = most important)
importance_rank = top.groupby("t")['Importance'].rank(ascending=False)
top["Importance_order"] = importance_rank.astype(int)

# Lemma lists of the selected clusters
list(top.lemmas)

<a id='clusters-rf-t'></a>
### 4. Comparar particiones y seleccionar T clústeres más predictores en cada repetición

In [None]:
# Compare the top clusters of every pair of partitions (repetitions)
clusters = pd.read_pickle(datawd + f"/clusters500/cluster_importances_500.pkl")
ntop = 16
top = clusters.groupby("t").head(ntop).sort_values(["t", "Cumulative"]).reset_index(drop = True)
top["Importance_order"] = top.groupby("t")['Importance'].rank(ascending=False).astype(int)

comparison_columns = ["P1", "P2", "cluster_P1", "cluster_P2",
                      "i_rank_P1", "i_rank_P2",
                      "intersection", "mean_cos_sim"]

partitions = top['t'].unique()
n_pairs = len(partitions) * (len(partitions) - 1) // 2  # gives tqdm a total for the progress bar

# Every cluster of one partition against every cluster of another partition.
# Rows are collected in a list and the frame is built once: filling a DataFrame cell by cell
# with .loc enlargement is far slower and leaves every column with object dtype.
records = []
for group1, group2 in tqdm(combinations(partitions, 2), total = n_pairs):
    group1_data = top[top['t'] == group1]
    group2_data = top[top['t'] == group2]

    for _, row1 in group1_data.iterrows():
        for _, row2 in group2_data.iterrows():
            records.append({
                "P1": row1['t'],
                "P2": row2['t'],
                "cluster_P1": row1['cluster'],
                "cluster_P2": row2['cluster'],
                "i_rank_P1": row1['Importance_order'],
                "i_rank_P2": row2['Importance_order'],
                # project helpers: overlap and mean cosine similarity between two lemma lists
                "intersection": prop_intersection(row1['lemmas'], row2['lemmas']),
                "mean_cos_sim": mean_cos_sim(row1['lemmas'], row2['lemmas'], l2v),
            })

comparison = pd.DataFrame(records, columns = comparison_columns)

# Min-max rescaling of the mean cosine similarity
mi = comparison.mean_cos_sim.min()
ma = comparison.mean_cos_sim.max()
comparison["mean_cos_sim_01"] = (comparison.mean_cos_sim - mi) / (ma - mi)
comparison.to_pickle(datawd + f"/clusters500/comparison_clusters500.pkl")

In [None]:
# Plot metrics for different values of T (number of top clusters kept per repetition)
# k is set here: the figure path used to depend on whatever value of k an earlier cell had left behind
k = 500
max_T = 15
cluster_dir = datawd + f"/clusters{k}"

clusters = pd.read_pickle(cluster_dir + f"/cluster_importances_{k}.pkl")
top = clusters.groupby("t").head(max_T).sort_values(["t", "Cumulative"]).reset_index(drop = True)
top["Importance_order"] = top.groupby("t")['Importance'].rank(ascending=False).astype(int)

comparison = pd.read_pickle(cluster_dir + f"/comparison_clusters{k}.pkl")

T_values = np.arange(1, max_T + 1)
metric_rows = []
for T in tqdm(T_values):
    # Total importance of the clusters ranked <= T, summed over all repetitions
    importance = top[top.Importance_order <= T].Importance.sum()
    # Mean overlap between cluster pairs that are both ranked <= T
    both_in_top = (comparison.i_rank_P1 <= T) & (comparison.i_rank_P2 <= T)
    intersection = comparison.loc[both_in_top, "intersection"].mean()
    metric_rows.append({"importance_sum": importance,
                        "intersection_prop": intersection})

t_top_metrics = pd.DataFrame(metric_rows, columns = ["importance_sum", "intersection_prop"])
t_top_metrics["T"] = T_values

# We keep 10 clusters (marked with the red vertical line)
ntop = 10

for c in t_top_metrics.columns[:-1]:  # every metric column; the last column is T itself
    plt.scatter(t_top_metrics["T"], t_top_metrics[c])
    plt.axvline(x  = ntop, color = "red", alpha = 0.5)
    plt.xlabel("T top clústers")
    plt.ylabel(c)
    plt.ylim(ymin = 0)
    plt.savefig(cluster_dir + f"/top_clusters_{c}.png", dpi = 300)
    plt.show()

<a id='jerarq'></a>
## Reagrupación vía clustering jerárquico

In [None]:
# Regroup the top clusters of all repetitions with hierarchical clustering
k = 500
ntop = 10
cluster_dir = datawd + f"/clusters{k}"
clusters   = pd.read_pickle(cluster_dir + f"/cluster_importances_{k}.pkl")
comparison = pd.read_pickle(cluster_dir + f"/comparison_clusters{k}.pkl")

top = clusters.groupby("t").head(ntop).sort_values(["t", "Cumulative"]).reset_index(drop = True)
top["Importance_order"] = top.groupby("t")['Importance'].rank(ascending=False).astype(int)

# .copy(): columns are added below, and writing to a filtered slice raises SettingWithCopyWarning
comparison = comparison[(comparison.i_rank_P1 <= ntop) & (comparison.i_rank_P2 <= ntop)].copy()

# Integer key per (partition, cluster): the digits "1" + partition id + cluster id
comparison["c1"] = ("1" + comparison.P1.astype("str") + comparison.cluster_P1.astype(str)).astype("int")
comparison["c2"] = ("1" + comparison.P2.astype("str") + comparison.cluster_P2.astype(str)).astype("int")

# Map every key to a consecutive id 0..n-1, used as row/column of the distance matrix
cs = np.unique(pd.concat([comparison.c1, comparison.c2]))
idmap = {key: pos for pos, key in enumerate(cs)}
n_items = len(idmap)

comparison["c1id"] = comparison.c1.map(idmap)
comparison["c2id"] = comparison.c2.map(idmap)

top["c"]  = ("1" + top.t.astype("str") + top.cluster.astype(str)).astype("int")
top["cid"] = top.c.map(idmap)


seed(9)
labs = pd.DataFrame()

# Cut height per "<linkage>_<metric>" combination; any combination not listed uses the default
default_threshold = 0.7
thresholds = {"single_mean_cos_sim_01": 0.45}

# Hierarchical clustering for each similarity measure and linkage method
for di in ["intersection", "mean_cos_sim_01"]:

    # Symmetric list of (row, column, similarity) entries: every pair in both directions
    forward = comparison[["c1id", "c2id", di]].rename(columns = {"c1id": "r", "c2id": "c", di: "val"})
    backward = comparison[["c2id", "c1id", di]].rename(columns = {"c2id": "r", "c1id": "c", di: "val"})
    print(forward.shape)
    pairs = pd.concat([forward, backward], axis = 0).reset_index(drop = True)
    del(forward, backward)
    print(pairs.shape)
    pairs["val"] = pairs.val.astype("float")

    # Sparse similarity -> dense distance. Pairs absent from `comparison` have similarity 0,
    # i.e. distance 1. Local names are used so that the lemma-level `distance_matrix`
    # needed by the Clusters_RF cells is not overwritten.
    similarity = sp.coo_matrix( ( pairs['val'], (pairs["r"], pairs['c'])),
                                shape = (n_items, n_items) )
    cluster_distances = 1 - similarity.toarray()
    np.fill_diagonal(cluster_distances, 0)
    condensed_distances = squareform(cluster_distances)  # condensed form expected by linkage

    for m in ["complete", "single"]:

        th = thresholds.get(m + "_" + di, default_threshold)

        l    = hierarchy.linkage(condensed_distances, method=m)
        labs[m + "_" + di] = hierarchy.fcluster(l, criterion='distance', t = th)

        # Dendrograms
        plt.figure(figsize=(15, 5))
        hierarchy.dendrogram(l, orientation='top',
                             labels = list(idmap.values()) ,
                             color_threshold = th )
        plt.title(f"{m}_{di}")
        plt.xlabel('Items')
        plt.ylabel('Distancia')
        plt.savefig(cluster_dir + f"/dendogram_{m}_{di}.png", dpi = 300)
        plt.show()

# The row position in labs is the consecutive cluster id (cid)
labs = labs.reset_index()
labs.rename(columns = {"index" : "cid"}, inplace = True)

# Attach the hierarchical labels to the top clusters
top = top[['Importance', 'cluster', 'Cumulative', 'n_lemmas', 'lemmas', 't',
       'Importance_order', 'c', 'cid']]

top = top.merge(labs,
                how = "left",
                on = "cid")

# For every complete/cosine group, list the labels it receives under the other three settings
comparative_tab = top.groupby("complete_mean_cos_sim_01", as_index = False).agg({"single_intersection"   : "unique",
                                                               "complete_intersection" : "unique",
                                                               "single_mean_cos_sim_01": "unique" })

comparative_tab.to_excel(cluster_dir + f"/clustering_clusters{k}_comparative.xlsx")
top.to_pickle(cluster_dir + f"/clustering_clusters{k}_metrics.pkl")


<a id='etiquetas'></a>
## Etiquetas finales

In [None]:
# Name the hierarchical groups by hand and merge them into the final immigration topics
k=500
top = pd.read_pickle(datawd + f"/clusters{k}/clustering_clusters{k}_metrics.pkl")


def _merge_lemmas(lemma_lists):
    """Return the sorted unique lemmas found across a list of lemma lists."""
    return np.unique([item for sublist in lemma_lists for item in sublist])


# One row per complete-linkage / cosine group, holding the union of its lemmas
top_final = top.groupby("complete_mean_cos_sim_01", as_index = False).agg({"lemmas" : list})
top_final["lemmas"] = [_merge_lemmas(row) for row in top_final.lemmas]

# Hand-written name -> manual group id.
# The ORDER matters: the i-th name is given to the i-th row of top_final
# (rows come out of the groupby sorted by hierarchical label).
manual_grouping = {   "Trabajos bien pagos" :5,
                              "Lugares del mundo": 11,
                              "Lenguage" : 3, 
                               "Nazismo": 9,
                               "Religión, ideología": 6,
                               "Ley inmigratoria": 8,
                               "Metafísica": 6,
                               "Gentilicios":11,
                               "Latino" : 2, 
                               "Lugares en Europa I":  4,
                               "Lugares en Europa II": 4 ,  
                               "Árabe israelí": 7,
                           "Política Estados Unidos": 1,
                           "Nueva York" : 1, 
                            "Británico": 10,
                            "Lugares en Norte América": 1, 
                            "Ley inmigratoria II": 8,
                           "Tecnología": 12,
                           "Latino II" : 2, 
                           "Medio oriente": 7,
                           "Economía comercial": 5,
                            "Moneda": 5}


# Manual group id -> final topic name
manual_grouping_names = {1: "Nueva York y Estados Unidos",
                        2: "Latino",
                        3: "Lenguaje",
                        4: "Europa",
                        5: "Economía y empleo",
                        6: "Religión, ideología, cosmovisión",
                        7: "Conflictos medio oriente",
                        8: "Ley inmigratoria",
                        9: "Nazismo",
                        10: "Británico",
                        11: "Gentilicios y lugares del mundo",
                        12: "Tecnología"}


# The names are positional, so a different number of hierarchical groups
# (e.g. after re-running the clustering) invalidates the manual labels
if len(manual_grouping) != len(top_final):
    raise ValueError(
        f"manual_grouping has {len(manual_grouping)} names but there are "
        f"{len(top_final)} hierarchical groups; review the manual labels"
    )

# list(...) instead of the raw dict_keys view, which not every pandas version accepts as a column
top_final["cluster_name"] = list(manual_grouping.keys())
top_final["manual_grouping_id"] =  top_final.cluster_name.map(manual_grouping)
top_final["manual_grouping_name"] = top_final.manual_grouping_id.map(manual_grouping_names)

top_final.to_pickle(datawd + "/clusters500/clustering_clusters_names.pkl")

# Final topics: union of the lemmas of all groups sharing a manual name
top_final_manual = top_final.groupby("manual_grouping_name", as_index = False).agg({"lemmas" : list})
top_final_manual["lemmas"] = [_merge_lemmas(row) for row in top_final_manual.lemmas]
top_final_manual["f_cluster_id"] = np.arange(0, top_final_manual.shape[0])

top_final_manual.to_pickle(datawd + "/clusters500/final_clusters500.pkl")

In [None]:
# Number of lemmas per final topic, exported to Excel together with the lemma lists
top_final_manual["n_lemmas"] = top_final_manual.lemmas.map(len)
export_columns = ["manual_grouping_name", "n_lemmas", "lemmas"]
top_final_manual[export_columns].to_excel(datawd + "/clusters500/final_clusters.xlsx")

# Summary statistics of the topic sizes
top_final_manual.n_lemmas.describe()

In [None]:
# Display the final topics table
top_final_manual

In [None]:
# Lemmas that appear in more than one final topic
lemma_rows = top_final_manual.explode("lemmas")
words_multiple_clusters = lemma_rows.groupby("lemmas").agg(
    n = ("manual_grouping_name", "nunique"),
    clusters = ("manual_grouping_name", "unique"),
)
is_repeated = words_multiple_clusters.n > 1
print(words_multiple_clusters[is_repeated])