# GETTING INFORMATION FROM AUTHORS

In [1]:
import json, os
import datetime
from pydantic import HttpUrl
import pandas as pd
from tqdm import tqdm

from pnytter import Pnytter

## 1. Load the Author's list

In [2]:
# Read the labelled retweet-author counts and give the aggregate column
# ('count_star()') a readable name in a single chained expression.
df_retweets = (
    pd.read_csv(
        '202503192044-AUTORES_RETWEETS_ETIQUETADOS_CONTAINS_HATE_TRUE.csv',
        index_col=None,
    )
    .rename(columns={'count_star()': 'ocurrencias'})
)

In [3]:
# Preview the first rows of the table loaded above.
df_retweets.head()

Unnamed: 0,author,ocurrencias
0,1694344217554776065,94
1,714377328869171205,19
2,4045996030,8
3,1595358274647359490,7
4,902242545408905218,6


In [4]:
# (rows, columns) of the loaded table.
df_retweets.shape

(1458, 2)

## 2. Get the Author's information

In [5]:
# Pnytter needs at least one Nitter instance to work; more can be added after
# initialization. To target a self-hosted instance instead of the public list
# below, pass nitter_instances=["http://localhost:8080"].
pnytter = Pnytter(
    nitter_instances=[
        'https://xcancel.com',
        'https://lightbrd.com',
        'https://nitter.lucabased.xyz',
        'https://nitter.space',
        'https://nitter.net',
        'https://nitter.privacyredirect.com',
        'https://nitter.privacydev.net',
        'https://nitter.poast.org',
        'http://nitter.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion',
    ],
)

Si partimos de datos ya descargados, estos han de estar en un fichero denominado **retweets_previo.csv** para seguir descargando datos a partir de ellos.

In [6]:
# Resume support: when a previous results file is present, load it.
# `df_anterior` is deliberately left undefined when the file is missing; the
# download loop checks for the existence of this name to decide whether there
# is anything to skip.
if os.path.exists('retweets_previo.csv'):
    df_anterior = pd.read_csv('retweets_previo.csv')

In [22]:
# Download every tweet referenced by `df_retweets` through Pnytter, collect its
# fields column-wise in `data`, record failing ids in `error`, and write the
# results to CSV (temporary checkpoints during the run, final files at the end).
#
# NOTE(review): this loop reads `row.tweet_id` and `row.author`, so
# `df_retweets` must expose both columns. The CSV loaded at the top of the
# notebook only shows `author` / `ocurrencias` -- confirm which frame is meant
# to be iterated here before relying on Restart & Run All.

cont = 0  # number of rows examined so far (skipped, failed or downloaded)

data = {
    'tweet_id': [],
    'author': [],
    'author_id': [],
    'created_at': [],
    'text': [],
    'retweets_count': [],
    'comments_count': [],
    'likes_count': [],
    'quotes_count': []
}

error = {
    'tweet_id': [],
}

# Ids already present in the previous results (if that frame was loaded).
# Built once as a set so the per-row membership test does not rescan the
# whole column on every iteration.
if 'df_anterior' in globals():
    processed_ids = set(df_anterior.tweet_id.tolist())
else:
    processed_ids = set()

for row in tqdm(df_retweets.itertuples(), total=len(df_retweets), desc='Buscando Retweets'):
    cont += 1

    if row.tweet_id not in processed_ids:
        try:
            tweet = pnytter.get_tweet(row.tweet_id)
        except Exception:
            # Best-effort download: any failure while fetching is treated like
            # a missing tweet, so the AttributeError branch below records the
            # id exactly once.
            tweet = None

        try:
            # Build the complete record BEFORE touching `data`. Appending field
            # by field would leave the column lists with different lengths if
            # an attribute is missing half-way through (e.g. no `stats`), and
            # pd.DataFrame(data) would then fail for the whole run.
            record = {
                'tweet_id': row.tweet_id,
                'author': tweet.author,
                'author_id': row.author,
                'created_at': tweet.created_on,
                'text': tweet.text,
                'retweets_count': tweet.stats.retweets,
                'comments_count': tweet.stats.comments,
                'likes_count': tweet.stats.likes,
                'quotes_count': tweet.stats.quotes,
            }
        except AttributeError:
            error['tweet_id'].append(row.tweet_id)
        else:
            for column, value in record.items():
                data[column].append(value)

    # Checkpoint every 100 examined rows. The check sits at the end of the
    # loop body and no path skips it, so skipped or failed rows can no longer
    # push the counter past the threshold without a save.
    if cont % 100 == 0:
        pd.DataFrame(data).to_csv('retweets_temp.csv', index=False)
        pd.DataFrame(error).to_csv('errors_temp.csv', index=False)


df = pd.DataFrame(data)
df_error = pd.DataFrame(error)
df.to_csv('retweets.csv', index=False)
df_error.to_csv('errors.csv', index=False)
print("Archivos retweets.csv y errors.csv generados correctamente.")


Buscando Retweets: 100%|██████████| 1776/1776 [00:57<00:00, 31.01it/s]

Archivos retweets.csv y errors.csv generados correctamente.





## Concat de retweets_previo y retweets

Como hemos comentado anteriormente, si hemos partido de datos ya descargados, estos estarán en retweets_previo.csv; por eso hemos de hacer un concat de retweets_previo y retweets, cuyo resultado se guarda en retweets_final.

In [23]:
# Merge this run's downloads ('retweets.csv') with the previous results
# ('retweets_previo.csv') into 'retweets_final.csv'.
#
# On a first run there is no previous file. In that case the merged result is
# simply this run's data, and `df_temp` is an empty frame with the same
# columns, so that `df_temp`, `df` and `df_final` are all defined whenever
# 'retweets.csv' exists.
has_previous = os.path.exists('retweets_previo.csv')
has_current = os.path.exists('retweets.csv')

if has_previous:
    df_temp = pd.read_csv('retweets_previo.csv')

if has_current:
    df = pd.read_csv('retweets.csv')
    if has_previous:
        # Same order as before: new rows first, previous rows after.
        df_final = pd.concat([df, df_temp], ignore_index=True)
    else:
        df_temp = df.iloc[0:0].copy()
        df_final = df.copy()
    df_final.to_csv('retweets_final.csv', index=False)

In [24]:
# Report the size of each frame: previous results, this run's downloads, and
# the merged total. Requires `df_temp`, `df` and `df_final` to exist in the
# kernel.
print("Datos en retweets Previos:", df_temp.shape)
print("Datos descargados en esta ejecución:", df.shape)
print("Datos Totales:", df_final.shape)

Datos en retweets Previos: (1595, 9)
Datos descargados en esta ejecución: (2, 9)
Datos Totales: (1597, 9)


In [25]:
# Sanity check on the merged frame: select rows that repeat an earlier row
# in every column.
duplicated_rows = df_final.loc[df_final.duplicated()]

# Show the offending rows (an empty frame means there are none) ...
print(duplicated_rows)

# ... and how many there are.
print(f"Número de duplicatas: {len(duplicated_rows)}")

Empty DataFrame
Columns: [tweet_id, author, author_id, created_at, text, retweets_count, comments_count, likes_count, quotes_count]
Index: []
Número de duplicatas: 0


In [26]:
# Sanity check on this run's downloads: select rows that repeat an earlier
# row in every column.
duplicated_rows = df.loc[df.duplicated()]

# Show the offending rows (an empty frame means there are none) ...
print(duplicated_rows)

# ... and how many there are.
print(f"Número de duplicatas: {len(duplicated_rows)}")

Empty DataFrame
Columns: [tweet_id, author, author_id, created_at, text, retweets_count, comments_count, likes_count, quotes_count]
Index: []
Número de duplicatas: 0


In [27]:
# Sanity check on the previous results: select rows that repeat an earlier
# row in every column.
duplicated_rows = df_temp.loc[df_temp.duplicated()]

# Show the offending rows (an empty frame means there are none) ...
print(duplicated_rows)

# ... and how many there are.
print(f"Número de duplicatas: {len(duplicated_rows)}")

Empty DataFrame
Columns: [tweet_id, author, author_id, created_at, text, retweets_count, comments_count, likes_count, quotes_count]
Index: []
Número de duplicatas: 0


In [28]:
# Promote this run's output to 'retweets_previo.csv', the baseline for the
# next run, without ever deleting data before its replacement is in place.
if os.path.exists('retweets_final.csv'):
    # os.replace overwrites an existing destination, so the old baseline is
    # swapped for the merged file in one step instead of remove-then-rename.
    os.replace('retweets_final.csv', 'retweets_previo.csv')
    if os.path.exists('retweets.csv'):
        os.remove('retweets.csv')
elif os.path.exists('retweets.csv') and not os.path.exists('retweets_previo.csv'):
    # First run: there is no baseline and no merged file, so this run's
    # downloads become the baseline.
    os.replace('retweets.csv', 'retweets_previo.csv')
# Otherwise (no merged file but a baseline already exists) nothing is touched:
# removing either input here would lose data that was never merged.

In [29]:
# Re-read the promoted baseline file and print its shape as a final check.
df_test = pd.read_csv('retweets_previo.csv')
print("Datos Finales:", df_test.shape)

Datos Finales: (1597, 9)
