# GETTING INFORMATION FROM RETWEETS

In [16]:
import json, os
import datetime
from pydantic import HttpUrl
import pandas as pd
from tqdm import tqdm

from pnytter import Pnytter

## 1. Load the Retweets list

In [17]:
# Load the list of retweets and give the aggregate column 'count_star()' a readable name.
df_retweets = (
    pd.read_csv('202503192227-RETWEETS_ETIQUETADOS_CONTAINS_HATE_TRUE.csv', index_col=None)
    .rename(columns={'count_star()': 'ocurrencias'})
)

In [18]:
# Preview the first rows of the retweet list (tweet id, author id, occurrences).
df_retweets.head()

Unnamed: 0,tweet_id,author,ocurrencias
0,1795781220925669620,963492493458329602,2
1,1795383660088074396,796158301788667908,2
2,1795168528464540035,482147864,2
3,1798852393096130705,1621152552258801672,2
4,1801196737324843027,1665677569671168002,2


In [19]:
# (rows, columns) of the retweet list.
df_retweets.shape

(1776, 3)

## 2. Get the Retweets' information

In [20]:
# Switch between a locally hosted Nitter instance and the public ones.
localserver = True

# The Pnytter object needs at least 1 Nitter instance to work, but these can be
# added after initialization.
if localserver:
    instances = ["http://localhost:8080"]
    server_label = "Local server"
else:
    instances = [
        'https://xcancel.com',
        'https://lightbrd.com',
        'https://nitter.lucabased.xyz',
        'https://nitter.space',
        'https://nitter.net',
        'https://nitter.privacyredirect.com',
        'https://nitter.privacydev.net',
        'https://nitter.poast.org',
        'http://nitter.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion',
    ]
    server_label = "Remote server"

pnytter = Pnytter(nitter_instances=instances)
print(server_label)

Local server


Si partimos de datos ya descargados, estos han de estar en un fichero denominado **retweets_previo.csv** para seguir descargando datos a partir de ellos.

In [21]:
# Load the tweets downloaded by earlier runs so the download loop can skip them.
if os.path.exists('retweets_previo.csv'):
    df_anterior = pd.read_csv('retweets_previo.csv')
else:
    # No earlier data. Bind an empty frame anyway: otherwise a df_anterior left
    # over from a previous execution of this cell would stay in the kernel and
    # the download loop would keep skipping tweets that are no longer on disk.
    df_anterior = pd.DataFrame({'tweet_id': pd.Series(dtype='int64')})

In [22]:
# Fetch the details of every retweet in df_retweets through pnytter and write
# them, together with the ids that failed, to retweets.csv / errors.csv.
# Partial results are written to retweets_temp.csv / errors_temp.csv while the
# loop is running.

# Number of input rows handled between two writes of the *_temp.csv files.
CHECKPOINT_EVERY = 100

cont = 0  # input rows handled since the last checkpoint

# One list per output column; pd.DataFrame(data) requires all of them to have
# the same length.
data = {
    'tweet_id': [],
    'author': [],
    'author_id': [],
    'created_at': [],
    'text': [],
    'retweets_count': [],
    'comments_count': [],
    'likes_count': [],
    'quotes_count': []
}

# Ids of the tweets that could not be fetched or read.
error = {
    'tweet_id': [],
}

# df_anterior may be unbound if the cell that loads retweets_previo.csv did not
# find the file; in that case nothing is skipped. The id array is looked up
# once here instead of on every row.
processed_ids = df_anterior.tweet_id.values if 'df_anterior' in globals() else ()


def _fetch_record(row):
    """Return the output columns for one retweet, or None if it cannot be obtained.

    Args:
        row: a row of df_retweets (as produced by itertuples) providing
            `tweet_id` and `author`.

    Returns:
        A dict with the same keys as `data`, or None when pnytter.get_tweet
        raises or the object it returns lacks one of the attributes read below.
    """
    try:
        tweet = pnytter.get_tweet(row.tweet_id)
    except Exception:
        # Deliberate best effort: a tweet that cannot be fetched is recorded in
        # `error` by the caller and the download continues.
        return None

    try:
        # Read every attribute before anything is appended to `data`, so a
        # tweet that fails half-way cannot leave the column lists with
        # different lengths (which would make pd.DataFrame(data) raise).
        return {
            'tweet_id': row.tweet_id,
            'author': tweet.author,
            'author_id': row.author,
            'created_at': tweet.created_on,
            'text': tweet.text,
            'retweets_count': tweet.stats.retweets,
            'comments_count': tweet.stats.comments,
            'likes_count': tweet.stats.likes,
            'quotes_count': tweet.stats.quotes,
        }
    except AttributeError:
        # Presumably get_tweet returned None or an incomplete object - TODO confirm.
        return None


def _write_outputs(tweets_path, errors_path):
    """Write `data` and `error` to the given CSV files and return both frames."""
    tweets = pd.DataFrame(data)
    failures = pd.DataFrame(error)
    tweets.to_csv(tweets_path, index=False)
    failures.to_csv(errors_path, index=False)
    return tweets, failures


for row in tqdm(df_retweets.itertuples(), total=len(df_retweets), desc='Buscando Retweets'):
    cont += 1

    # Tweets already present in the previous download are not requested again.
    if row.tweet_id not in processed_ids:
        record = _fetch_record(row)
        if record is None:
            error['tweet_id'].append(row.tweet_id)
        else:
            for column, value in record.items():
                data[column].append(value)

    # The check runs for every row (skipped and failed ones included) and uses
    # >=, so the counter can never step past the threshold and stop
    # checkpointing for the rest of the run.
    if cont >= CHECKPOINT_EVERY:
        cont = 0
        df, df_error = _write_outputs('retweets_temp.csv', 'errors_temp.csv')


df, df_error = _write_outputs('retweets.csv', 'errors.csv')
print("Archivos retweets.csv y errors.csv generados correctamente.")


Buscando Retweets: 100%|██████████| 1776/1776 [06:06<00:00,  4.85it/s]

Archivos retweets.csv y errors.csv generados correctamente.





## Concat de retweets_previo y retweets

Como hemos comentado anteriormente, si hemos partido de datos ya descargados, partiremos de `retweets_previo.csv`; por eso hemos de hacer un concat de `retweets_previo.csv` y `retweets.csv` en `retweets_final.csv`.

In [23]:
# Merge the tweets downloaded in this run (retweets.csv) with the ones from
# earlier runs (retweets_previo.csv) into retweets_final.csv.
has_previous = os.path.exists('retweets_previo.csv')
has_current = os.path.exists('retweets.csv')

if has_previous:
    df_temp = pd.read_csv('retweets_previo.csv')

if has_current:
    df = pd.read_csv('retweets.csv')
    if has_previous:
        df_final = pd.concat([df, df_temp], ignore_index=True)
    else:
        # First run: there is no earlier data to merge with. Define df_temp as
        # an empty frame with the same columns and use this run's data as the
        # total, so the cells below (which read df_temp and df_final) still work.
        df_temp = df.iloc[0:0].copy()
        df_final = df.reset_index(drop=True)
    df_final.to_csv('retweets_final.csv', index=False)

In [24]:
# Report the size of the previous data, of this run's download and of their union.
for label, frame in (
    ("Datos en retweets Previos:", df_temp),
    ("Datos descargados en esta ejecución:", df),
    ("Datos Totales:", df_final),
):
    print(label, frame.shape)

Datos en retweets Previos: (1624, 9)
Datos descargados en esta ejecución: (0, 9)
Datos Totales: (1624, 9)


In [25]:
# Verificar duplicatas em todas as colunas
duplicated_rows = df_final[df_final.duplicated()]

# Exibir as duplicatas
print(duplicated_rows)

# Contar o número de duplicatas
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [tweet_id, author, author_id, created_at, text, retweets_count, comments_count, likes_count, quotes_count]
Index: []
Número de duplicatas: 0


In [26]:
# Verificar duplicatas em todas as colunas
duplicated_rows = df[df.duplicated()]

# Exibir as duplicatas
print(duplicated_rows)

# Contar o número de duplicatas
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [tweet_id, author, author_id, created_at, text, retweets_count, comments_count, likes_count, quotes_count]
Index: []
Número de duplicatas: 0


In [27]:
# Verificar duplicatas em todas as colunas
duplicated_rows = df_temp[df_temp.duplicated()]

# Exibir as duplicatas
print(duplicated_rows)

# Contar o número de duplicatas
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [tweet_id, author, author_id, created_at, text, retweets_count, comments_count, likes_count, quotes_count]
Index: []
Número de duplicatas: 0


In [28]:
# borramos y renombramos los ficheros resultantes
os.remove('retweets.csv')
os.remove('retweets_previo.csv')
os.rename('retweets_final.csv', 'retweets_previo.csv')

In [29]:
# Re-read the rotated file to confirm what is now stored on disk.
df_test = pd.read_csv('retweets_previo.csv')
final_shape = df_test.shape
print("Datos Finales:", final_shape)

Datos Finales: (1624, 9)


In [30]:
# Ten tweets with the highest likes_count.
df_test.sort_values(by='likes_count', ascending=False).head(10)

Unnamed: 0,tweet_id,author,author_id,created_at,text,retweets_count,comments_count,likes_count,quotes_count
1618,1805031024553853082,jorgitomaduixas,579297067,2024-06-24 00:11:00+00:00,"No es futbol, és política, se trata de catalan...",22,25,302,8
1015,1804311342439858354,republicatransg,1559972449579683841,2024-06-22 00:32:00+00:00,Que tú mantengas las piernas cerradas delante ...,37,36,299,1
121,1808078694595862978,Hsa1981_02,1466095115060002832,2024-07-02 10:02:00+00:00,"Pero, ¿ Ya habéis pagado las nóminas ? , ¿ Ya ...",14,19,275,0
1553,1801199879458341013,esmultivac,158730655,2024-06-13 10:28:00+00:00,Quieres decir que tienes que repetir siempre l...,16,7,157,0
1603,1803920415670309221,10InfoBlaugrana,1313101153970917378,2024-06-20 22:38:00+00:00,Ahora no te subas al barco payaso,0,1,100,0
1525,1807156622797717558,StalingradolaB,2254499835,2024-06-29 20:58:00+00:00,No tienes vergüenza.\r\nNos habéis privado de ...,22,2,97,0
292,1808108166959264099,DaniCuevasLpz,417499634,2024-07-02 11:59:00+00:00,Y vosotros os lo creéis.\r\n\r\nParece mentira...,0,6,70,0
68,1808109156584296936,JordiMajoRuaix,1342079120893763585,2024-07-02 12:03:00+00:00,Vaya información más absurda además de poco cr...,2,5,62,1
685,1807893863723373021,Gelito_RM,1694073790228058112,2024-07-01 21:47:00+00:00,Carme con las arrugas que me llevas estas tu p...,0,0,51,0
623,1805109486983520275,Nachete82,183635768,2024-06-24 05:23:00+00:00,Si me hablases de continentes te lo puedo comp...,0,1,41,1
