# GETTING INFORMATION FROM AUTHORS

In [1]:
import json, os
import datetime
from pydantic import HttpUrl
import pandas as pd
from tqdm import tqdm

from pnytter import Pnytter

## 1. Load the Author's list

Obtenemos esta lista a partir de los datos de los retweets, extrayendo los autores únicos.

In [2]:
# Read the retweets export and, when the aggregate column 'count_star()'
# is present, expose it under the name 'ocurrencias'.
df_retweets = (
    pd.read_csv('retweets_previo.csv')
    .rename(columns={'count_star()': 'ocurrencias'})
)

In [3]:
# Preview the first five retweet rows.
df_retweets.head(n=5)

Unnamed: 0,tweet_id,author,author_id,created_at,text,retweets_count,comments_count,likes_count,quotes_count
0,1808104824262594876,fdiaz31,1415640330783891458,2024-07-02 11:45:00+00:00,Pero prefiere el Barcelona. \nFuente: Jota Jordi.,0,0,0,0
1,1806254823139856439,rodrgue_carlos,1138440146,2024-06-27 09:14:00+00:00,Tienes que tener las rodillas en carne viva.,0,0,0,0
2,1805683330618474943,fernandovilabla,236524587,2024-06-25 19:23:00+00:00,Qué triste ser una arrastrada por un hijo medi...,0,0,1,0
3,1807525208267305305,Jorge1941746303,1749504267092639745,2024-06-30 21:22:00+00:00,Un deseo le pido a la vida.. encontrarte por l...,0,0,0,0
4,1807527629567738115,IKERERIK,348680194,2024-06-30 21:32:00+00:00,A ver tontin que han pasado de milagro,0,0,0,0


In [4]:
# One row per distinct author with the number of retweets attributed to them,
# most frequent first (value_counts sorts in descending order).
# Both output columns are named explicitly instead of renaming 'count':
# the default labels produced by value_counts().reset_index() differ between
# pandas 1.x ('index' / 'author') and pandas 2.x ('author' / 'count').
df_authors = (
    df_retweets['author']
    .value_counts()
    .rename_axis('author')
    .reset_index(name='ocurrencias')
)

In [5]:
# Preview the five most frequent authors.
df_authors.head(n=5)

Unnamed: 0,author,ocurrencias
0,v66710974699714,94
1,quinomartinez58,14
2,AE_HencheDiego,6
3,Agc_19827,6
4,Luis22Pedro,5


In [6]:
# (rows, columns) of the per-author table, i.e. how many distinct authors there are.
df_authors.shape

(1326, 2)

## 2. Get the Author's information

In [7]:
# Toggle: True uses the Nitter instance running on this machine,
# False uses the list of public instances below.
localserver = True

# The Pnytter object needs at least 1 Nitter instance to work, but these can be added after initialization
pnytter = Pnytter(
    nitter_instances=(
        ["http://localhost:8080"]
        if localserver
        else [
            'https://xcancel.com',
            'https://lightbrd.com',
            'https://nitter.lucabased.xyz',
            'https://nitter.space',
            'https://nitter.net',
            'https://nitter.privacyredirect.com',
            'https://nitter.privacydev.net',
            'https://nitter.poast.org',
            'http://nitter.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion',
        ]
    )
)
print("Local server" if localserver else "Remote server")

Local server


Si partimos de datos ya descargados, estos han de estar en un fichero denominado **authors_previo.csv** para poder continuar la descarga a partir de ellos.

In [8]:
# Resume support: when a previous download exists, load it.
# `df_anterior` is deliberately left undefined when the file is absent.
if os.path.exists('authors_previo.csv'):
    df_anterior = pd.read_csv(
        'authors_previo.csv',
        dtype={'author_id': str},  # read the IDs as text instead of numbers
    )

In [9]:
# Number of attempted look-ups between two intermediate saves of the partial results.
CHECKPOINT_EVERY = 100

# Column-oriented accumulator for the downloaded profiles: one list per output column.
data = {
    'author_id': [],
    'username': [],
    'fullname': [],
    'biography': [],
    'verified': [],
    'joined_datetime': [],
    'tweets_count': [],
    'following_count': [],
    'followers_count': [],
    'likes_count': [],
    'profile_picture_url': [],
    'banner_picture_url': []
}

# Usernames whose look-up raised, returned nothing, or could not be flattened.
error = {
    'username': [],
}


def _profile_to_record(username, user):
    """Flatten one profile returned by ``pnytter.find_user`` into a row dict.

    The complete row is built before anything is stored, so a profile with a
    missing attribute raises ``AttributeError`` without leaving the column
    lists in ``data`` with different lengths.

    Args:
        username: Handle used for the look-up; stored in the ``username`` column.
        user: Profile object. The attributes read are ``id``, ``fullname``,
            ``biography``, ``verified``, ``joined_datetime``, ``stats`` and
            ``pictures``.

    Returns:
        A dict with exactly the keys of ``data``.

    Raises:
        AttributeError: If the profile lacks one of the attributes read here.
    """
    joined = user.joined_datetime
    profile_picture = user.pictures.profile
    banner_picture = user.pictures.banner
    return {
        'author_id': str(user.id) if user.id is not None else '',
        'username': username,
        'fullname': user.fullname,
        'biography': user.biography,
        'verified': user.verified,
        'joined_datetime': (
            joined.strftime('%Y-%m-%d %H:%M:%S') if joined is not None else None
        ),
        'tweets_count': user.stats.tweets,
        'following_count': user.stats.following,
        'followers_count': user.stats.followers,
        'likes_count': user.stats.likes,
        'profile_picture_url': (
            str(profile_picture.twitter_url) if profile_picture is not None else None
        ),
        'banner_picture_url': (
            str(banner_picture.twitter_url) if banner_picture is not None else None
        ),
    }


def _save_results(authors_path, errors_path):
    """Write the profiles and failed usernames collected so far to CSV.

    Returns:
        The two frames that were written: ``(authors, errors)``.
    """
    authors_frame = pd.DataFrame(data)
    errors_frame = pd.DataFrame(error)
    authors_frame.to_csv(authors_path, index=False)
    errors_frame.to_csv(errors_path, index=False)
    return authors_frame, errors_frame


# Usernames already present in the previous download. Built once, as a set,
# instead of probing the namespace and scanning the column for every row.
# `df_anterior` only exists when the previous-download file was loaded.
if 'df_anterior' in globals():
    already_downloaded = set(df_anterior['username'].dropna().astype(str))
else:
    already_downloaded = set()

cont = 0  # look-ups attempted since the last intermediate save

for row in tqdm(df_authors.itertuples(), total=len(df_authors), desc='Buscando Authors'):
    # Skip authors that were already downloaded in a previous run.
    if str(row.author) in already_downloaded:
        continue

    try:
        user = pnytter.find_user(row.author)
    except Exception:
        # Best effort: whatever the look-up raises, remember the username and
        # keep going so that one failing author does not abort the whole run.
        error['username'].append(row.author)
    else:
        if user is None:
            # No profile came back; record it instead of dropping it silently.
            error['username'].append(row.author)
        else:
            try:
                record = _profile_to_record(row.author, user)
            except AttributeError as exc:
                print(f"Error al procesar el user {row.author}: {exc}")
                error['username'].append(row.author)
            else:
                # Append all columns together so the lists stay the same length.
                for column, value in record.items():
                    data[column].append(value)

    # Checkpoint after every attempted author (success or failure). `>=` keeps
    # the checkpoint firing even if the counter were ever to skip the exact value.
    cont += 1
    if cont >= CHECKPOINT_EVERY:
        cont = 0
        df, df_error = _save_results('authors_temp.csv', 'errors_author_temp.csv')

df, df_error = _save_results('authors.csv', 'errors_author.csv')
print("Archivos authors.csv y errors_author.csv generados correctamente.")

Buscando Authors: 100%|██████████| 1326/1326 [00:03<00:00, 413.18it/s]

Archivos authors.csv y errors_author.csv generados correctamente.





## Concat de authors_previo y authors

Como hemos comentado anteriormente, si hemos partido de datos ya descargados, partiremos de authors_previo.csv; por eso hemos de hacer un concat de authors_previo.csv y authors.csv en authors_final.csv.

In [10]:
def _read_authors(path):
    """Read an authors CSV with ``author_id`` as text; empty frame if the file is missing."""
    if os.path.exists(path):
        return pd.read_csv(path, dtype={'author_id': str})
    return pd.DataFrame()


# Always define the three frames so the following cells also work on a first
# run, when there is no previous download yet.
df_temp = _read_authors('authors_previo.csv')  # previous download
df = _read_authors('authors.csv')              # this run's download

# New rows first, then the previous ones. Empty frames are left out of the
# concat so they cannot influence the resulting column dtypes.
frames = [frame for frame in (df, df_temp) if not frame.empty]
df_final = pd.concat(frames, ignore_index=True) if frames else df.copy()

# Write the merged file whenever there was at least one input file.
if os.path.exists('authors_previo.csv') or os.path.exists('authors.csv'):
    df_final.to_csv('authors_final.csv', index=False)

In [11]:
# Shapes of the previous file, of this run's download, and of the merged result.
for label, frame in (
    ("Datos en authors Previos:", df_temp),
    ("Datos descargados en esta ejecución:", df),
    ("Datos Totales:", df_final),
):
    print(label, frame.shape)

Datos en authors Previos: (1325, 12)
Datos descargados en esta ejecución: (1, 12)
Datos Totales: (1326, 12)


In [12]:
# Rows of the merged table that repeat an earlier row in every column.
duplicated_rows = df_final.loc[df_final.duplicated(keep='first')]

# Show the repeated rows, then how many there are.
print(duplicated_rows)
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [author_id, username, fullname, biography, verified, joined_datetime, tweets_count, following_count, followers_count, likes_count, profile_picture_url, banner_picture_url]
Index: []
Número de duplicatas: 0


In [13]:
# Rows of this run's download that repeat an earlier row in every column.
duplicated_rows = df.loc[df.duplicated(keep='first')]

# Show the repeated rows, then how many there are.
print(duplicated_rows)
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [author_id, username, fullname, biography, verified, joined_datetime, tweets_count, following_count, followers_count, likes_count, profile_picture_url, banner_picture_url]
Index: []
Número de duplicatas: 0


In [14]:
# Rows of the previous download that repeat an earlier row in every column.
duplicated_rows = df_temp.loc[df_temp.duplicated(keep='first')]

# Show the repeated rows, then how many there are.
print(duplicated_rows)
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [author_id, username, fullname, biography, verified, joined_datetime, tweets_count, following_count, followers_count, likes_count, profile_picture_url, banner_picture_url]
Index: []
Número de duplicatas: 0


In [15]:
# Promote this run's result to 'authors_previo.csv', the baseline of the next run.
# Nothing is deleted until its replacement is known to exist, so a missing
# file can no longer destroy the only copy of the downloaded data.
if os.path.exists('authors_final.csv'):
    # os.replace overwrites the destination in a single step, so the old
    # baseline does not have to be removed beforehand.
    os.replace('authors_final.csv', 'authors_previo.csv')
    if os.path.exists('authors.csv'):
        os.remove('authors.csv')
elif os.path.exists('authors.csv') and not os.path.exists('authors_previo.csv'):
    # First run: there was nothing to merge with, so this run's download
    # becomes the baseline as it is.
    os.replace('authors.csv', 'authors_previo.csv')
# Otherwise (no merged file although a baseline exists) the files are left
# untouched: the merge step has to be run first.

In [16]:
# Read the baseline file back from disk and report its shape.
df_test = pd.read_csv(
    'authors_previo.csv',
    dtype={'author_id': str},
)
print("Datos Finales:", df_test.shape)

Datos Finales: (1326, 12)


In [17]:
# Preview the first five rows of the reloaded file.
df_test.head(n=5)

Unnamed: 0,author_id,username,fullname,biography,verified,joined_datetime,tweets_count,following_count,followers_count,likes_count,profile_picture_url,banner_picture_url
0,1.4156403307838915e+18,fdiaz31,FAADZ,,True,2021-07-15 11:52:00,3040,226,144,94823,https://pbs.twimg.com/profile_images/177803622...,https://pbs.twimg.com/profile_banners/14156403...
1,110675227.0,Antonio_H_N,Antonio H.N.,Extremadamente madridista.,True,2010-02-02 11:25:00,26633,653,490,13924,https://pbs.twimg.com/profile_images/165261440...,https://pbs.twimg.com/profile_banners/11067522...
2,1.2605401611480105e+18,Maria44130399,Villana,Orgullosamente bloqueada por Perez Reverte. Fe...,True,2020-05-13 11:59:00,27256,567,138,24200,https://pbs.twimg.com/profile_images/190269703...,https://pbs.twimg.com/profile_banners/12605401...
3,,SheikhAlhajiK1,Sheikh Alhaji K,I Always try to give ppl what they want.,True,2022-02-01 15:54:00,551,60,4,324,https://pbs.twimg.com/profile_images/182474544...,
4,,IsmaelBakkali3,Barçafan,,True,2022-07-16 02:13:00,98,148,3,511,https://pbs.twimg.com/profile_images/172070674...,


In [18]:
# Spot check: show the stored row(s) for one specific username.
df_test.loc[df_test['username'] == 'SiempreFieles14']

Unnamed: 0,author_id,username,fullname,biography,verified,joined_datetime,tweets_count,following_count,followers_count,likes_count,profile_picture_url,banner_picture_url
733,1440246786841399300,SiempreFieles14,Siempre Fieles 15🏆,"SER DEL REAL MADRID NO ES UNA OPCIÓN,ES UN SEN...",True,2021-09-21 09:29:00,21357,4086,4263,57552,https://pbs.twimg.com/profile_images/144024948...,https://pbs.twimg.com/profile_banners/14402467...
