In [1]:
import pandas as pd
import numpy as np
import os
from random import randint
from tqdm import tqdm
#import tensorflow as tf
#import tensorflow.experimental.numpy as tnp
import pickle
import random
from typing import Callable

#conda deactivate
#conda activate tf

In [2]:
#tnp.experimental_enable_numpy_behavior()

In [4]:
tweets_pd = pd.read_csv('./tweets_2022_abril_junio.csv')

In [None]:
def filter_tweets(data: str) -> str:
    punctuations = {".",",","!","'",":"}
    if data[0:2] == "RT":  # eliminamos el RT y el usuario asociado
        try:
            data = data.split(":")[1]
        except IndexError:
            data = data
    for p in punctuations:
        data = data.replace(p,"")
    data = data.encode('ascii', 'ignore').decode('ascii')
    data = data.lower()
    if 'https' in data or len(data) < 15:
        data = pd.NA
    return data

tweets_pd["text"] = tweets_pd["text"].apply(filter_tweets)

In [None]:
tweets_pd.dropna(inplace=True)
tweets_pd = tweets_pd[["screen_name", "text"]].reset_index(drop=True)

In [None]:
len(tweets_pd)

3153403

In [None]:
user_tweet_count = tweets_pd.groupby("screen_name").count().reset_index()

In [None]:
# drop users with less than 20 tweets from tweets_pd
tweets_pd = tweets_pd[tweets_pd["screen_name"].isin(user_tweet_count[user_tweet_count["text"] >= 20]["screen_name"])].reset_index(drop=True)

In [None]:
# count number of tweets per user
user_tweet_count = tweets_pd.groupby("screen_name").count().reset_index()
# rename tweet count column
user_tweet_count.rename(columns={"text": "tweet_count"}, inplace=True)

with open("users.pkl", "wb") as f:
    pickle.dump(user_tweet_count, f)

In [None]:
tweet_shingles = {}
k = 4 # shingle size
s = 0.5 # similarity threshold (0-1)

BSIZE = 500000

counter = 0
for i in range(0, len(tweets_pd), BSIZE):
    temp = tweets_pd.iloc[i:i+BSIZE].copy()
    temp["shingles"] = [set([tweet[i:i+k] for i in range(len(tweet) - k + 1)]) for tweet in tqdm(temp["text"])]
    temp = temp[["screen_name", "shingles"]].reset_index(drop=True)
    with open(f"./shingle_batch_{counter}.pkl", "wb") as f:
        pickle.dump(temp, f)
    counter += 1
    del temp

100%|██████████| 500000/500000 [00:19<00:00, 25080.31it/s]
100%|██████████| 500000/500000 [00:18<00:00, 26451.99it/s]
100%|██████████| 500000/500000 [00:21<00:00, 23391.63it/s]
100%|██████████| 500000/500000 [00:19<00:00, 26282.16it/s]
100%|██████████| 500000/500000 [00:19<00:00, 25660.23it/s]
100%|██████████| 500000/500000 [00:19<00:00, 25062.45it/s]
100%|██████████| 153403/153403 [00:04<00:00, 30687.70it/s]


In [None]:
shingles = set()

for i in tqdm(range(0, 7)):
    with open(f"shingle_batch_{i}.pkl", "rb") as f:
        temp = pickle.load(f)
    shingles = shingles.union(set.union(*temp["shingles"]))

100%|██████████| 7/7 [02:34<00:00, 22.05s/it]


In [None]:
print(len(shingles))

264710


In [None]:
with open(f"./shingles.pkl", "wb") as f:
    pickle.dump(shingles, f)

In [2]:
SHINGLES = 264710
TWEETS = 3153403

In [5]:
with open(f"./shingles.pkl", "rb") as f:
    temp = pickle.load(f)

In [3]:
def jaccard_similarity(s1: set, s2: set) -> float:
    inetersection = s1.intersection(s2)
    union = s1.union(s2)
    return len(inetersection) / len(union)

In [4]:
def crear_hash(a: int, b: int, p: int, n: int) -> Callable:
    def f(x: int) -> int:
        return ((a * x + b) % p) % n
    return f

h = []
n = SHINGLES
p = 20063
num_hash = 50
for i in range(num_hash):
    a = random.randint(1,p-1)
    b = random.randint(1,p-1)
    h.append(crear_hash(a,b,p,n))

In [10]:
# precompute hash values
hash_values = {}
with open(f"./shingles.pkl", "rb") as f:
    shingles = pickle.load(f)
for shingle in tqdm(shingles):
    hash_values[shingle] = [h[i](hash(shingle)) for i in range(num_hash)]
with open(f"./hash_values.pkl", "wb") as f:
    pickle.dump(hash_values, f)

100%|██████████| 264710/264710 [00:06<00:00, 37952.47it/s]


In [None]:
%%script echo skipping
users_auto_similarity = {}

with open("./users.pickle", "rb") as f:
    users = pickle.load(f)

with open(f"./hash_values.pkl", "rb") as f:
    hash_values = pickle.load(f)

def shignle_to_hash(shingle: str) -> list:
    return min([hash_values[shingle][i] for i in range(num_hash)])

for user in users["screen_name"]:
    print(f"Computing autosimilarity {user}")
    cols = num_hash
    rows = users[users["screen_name"] == user]["tweet_count"].values[0]
    signature_matrix = np.full((rows, cols), np.inf)
    count = 0
    for i in tqdm(range(0, 7)):
        with open(f"shingle_batch_{i}.pkl", "rb") as f:
            temp = pickle.load(f)
        temp = temp[temp["screen_name"] == user].reset_index(drop=True)
        for j in range(len(temp)):
            for k in range(num_hash):
                for shingle in temp["shingles"][j]:
                    signature_matrix[count][k] = min(signature_matrix[count][k], hash_values[shingle][k])
            count += 1

    with open(f"./signature_matrix_{user}.pkl", "wb") as f:
        pickle.dump(signature_matrix, f)
    # compute similarity between tweets
    similarity_matrix = np.zeros((cols, cols))
    for i in range(cols):
        for j in range(cols):
            similarity_matrix[i][j] = jaccard_similarity(set(signature_matrix[:,i]), set(signature_matrix[:,j]))
    # get average similarity
    similarity = np.mean(similarity_matrix)
    users_auto_similarity[user] = similarity
    with open(f"./users_auto_similarity.pkl", "wb") as f:
        pickle.dump(users_auto_similarity, f)

Couldn't find program: 'echo'


In [9]:
users_auto_similarity = {}
users_similarity_matrix = {}

with open("./users.pkl", "rb") as f:
    users = pickle.load(f)

with open(f"./hash_values.pkl", "rb") as f:
    hash_values = pickle.load(f)

def shignle_to_hash(shingle: str) -> list:
    return min([hash_values[shingle][i] for i in range(num_hash)])

user_prev_count = {}
for user in users["screen_name"]:
    user_prev_count[user] = 0

cols = num_hash
for i in tqdm(range(0, 7)):
    with open(f"shingle_batch_{i}.pkl", "rb") as f:
        temp = pickle.load(f)
    for user in users["screen_name"]:
        rows = users[users["screen_name"] == user]["tweet_count"].values[0]
        #print(f"Computing autosimilarity {user}, iteration {i}")
        try:
            with open(f"./signature_matrix_{user}.pkl", "rb") as f:
                signature_matrix = pickle.load(f)
        except FileNotFoundError:
            signature_matrix = np.full((rows, cols), np.inf)

        count = user_prev_count[user]
        temp = temp[temp["screen_name"] == user].reset_index(drop=True)
        for j in range(len(temp)):
            for k in range(num_hash):
                for shingle in temp["shingles"][j]:
                    signature_matrix[count][k] = min(signature_matrix[count][k], hash_values[shingle][k])
            count += 1
        user_prev_count[user] = count

        with open(f"./signature_matrix_{user}.pkl", "wb") as f:
            pickle.dump(signature_matrix, f)
        # compute similarity between tweets
        similarity_matrix = np.zeros((rows, cols))
        for ii in range(rows):
            for iii in range(cols):
                similarity_matrix[ii][iii] = jaccard_similarity(set(signature_matrix[:,ii]), set(signature_matrix[:,iii]))
        # get average similarity
        similarity = np.mean(similarity_matrix)
        users_auto_similarity[user] = similarity
        users_similarity_matrix[user] = similarity_matrix

100%|██████████| 7/7 [54:03<00:00, 463.39s/it]


In [10]:
with open("users_similarity_matrix.pkl", "wb") as f:
    pickle.dump(users_similarity_matrix, f)

with open("users_auto_similarity.pkl", "wb") as f:
    pickle.dump(users_auto_similarity, f)

In [12]:
users_auto_similarity = pd.DataFrame.from_dict(users_auto_similarity, orient="index", columns=["auto_similarity"])

In [14]:
users_auto_similarity.describe()

Unnamed: 0,auto_similarity
count,31062.0
mean,0.999971
std,0.005038
min,0.112022
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [None]:
with open("./users.pkl", "rb") as f:
    users = pickle.load(f)
    
cols = num_hash
rows = users[users["screen_name"] == user]["tweet_count"].values[0]

In [None]:
users_auto_similarity = {}
users_similarity_matrix = {}

for user in tqdm(users["screen_name"]):
    with open(f"./signature_matrices/signature_matrix_{user}.pkl", "rb") as f:
        signature_matrix = pickle.load(f)
    # compute similarity between tweets
    similarity_matrix = np.zeros((cols, cols))
    for i in range(cols):
        for j in range(cols):
            similarity_matrix[i][j] = jaccard_similarity(set(signature_matrix[:,i]), set(signature_matrix[:,j]))
    # get average similarity
    similarity = np.mean(similarity_matrix)
    users_auto_similarity[user] = similarity
    users_similarity_matrix[user] = similarity_matrix

100%|██████████| 31062/31062 [28:59<00:00, 17.86it/s] 


La idea de esto es la siguiente:

1. Obtener una "auto-similitud" de Jaccard:
   1. Para cada usuario, obtener su matriz minhash
   2. Calcular la similitud de Jaccard de un usuario con si mismo como el promedio de sus similitudes
   3. Esto debería entregar los usuarios más "consistentes"
2. Tomar a los X usuarios más consistentens
3. Calcular la similitud de Jaccard entre ellos a partir de muestras.
4. Hacer 5 grupos de 5 usuarios similares cada uno.

In [13]:
users_auto_similarity.describe()

Unnamed: 0,auto_similarity
count,31062.0
mean,0.999971
std,0.005038
min,0.112022
25%,1.0
50%,1.0
75%,1.0
max,1.0


Y la estrategia no sirvió de mucho, ya que la amplia mayoría de los usuarios escriben parecido a ellos mismos, que en retrospectiva es algo obvio.

Para poder obtener alguna conclusión a partir de estos datos, vamos a probar que ocurre si pasamos la matriz de similitud de cada usuario al dominio de las frecuencias y ver que tanto se parecen sus curvas.