# GETTING INFORMATION FROM AUTHORS

In [33]:
import json, os
import datetime
from pydantic import HttpUrl
import pandas as pd
from tqdm import tqdm

from pnytter import Pnytter

## 1. Load the Author's list

Obtenemos esta lista a partir de los datos de los retweets, extrayendo los autores únicos.

In [34]:
# Load the previously downloaded retweets. If the CSV carries a
# 'count_star()' column it is exposed under the name 'ocurrencias';
# otherwise the rename leaves the frame untouched.
df_retweets = (
    pd.read_csv('retweets_previo.csv', index_col=None)
    .rename(columns={'count_star()': 'ocurrencias'})
)

In [35]:
# Preview the first five retweet rows.
df_retweets.head(n=5)

Unnamed: 0,tweet_id,author,author_id,created_at,text,retweets_count,comments_count,likes_count,quotes_count
0,1806254823139856439,rodrgue_carlos,1138440146,2024-06-27 09:14:00+00:00,Tienes que tener las rodillas en carne viva.,0,0,0,0
1,1805683330618474943,fernandovilabla,236524587,2024-06-25 19:23:00+00:00,Qué triste ser una arrastrada por un hijo medi...,0,0,1,0
2,1807525208267305305,Jorge1941746303,1749504267092639745,2024-06-30 21:22:00+00:00,Un deseo le pido a la vida.. encontrarte por l...,0,0,0,0
3,1807527629567738115,IKERERIK,348680194,2024-06-30 21:32:00+00:00,A ver tontin que han pasado de milagro,0,0,0,0
4,1807479914624376932,michel38946321,1588948388204220421,2024-06-30 18:22:00+00:00,Estabas más callado que una movía y enseguida ...,2,0,3,0


In [36]:
# One row per unique author together with how many rows of df_retweets
# belong to that author, most frequent first.
# The index and the counts are named explicitly instead of relying on the
# labels value_counts() produces: pandas >= 2.0 names them 'author'/'count',
# older releases do not, which would leave the columns mislabelled.
df_authors = (
    df_retweets['author']
    .value_counts()
    .rename_axis('author')
    .reset_index(name='ocurrencias')
)

In [37]:
# Preview the five authors with the most rows.
df_authors.head(n=5)

Unnamed: 0,author,ocurrencias
0,v66710974699714,94
1,quinomartinez58,14
2,Agc_19827,6
3,AE_HencheDiego,6
4,lavanduardia012,5


In [38]:
# (number of unique authors, number of columns)
(len(df_authors.index), len(df_authors.columns))

(1325, 2)

## 2. Get the Author's information

In [39]:
# Toggle: True -> use the Nitter instance running on this machine,
#         False -> use the list of public instances below.
localserver = True

local_instances = ["http://localhost:8080"]
remote_instances = [
    'https://xcancel.com',
    'https://lightbrd.com',
    'https://nitter.lucabased.xyz',
    'https://nitter.space',
    'https://nitter.net',
    'https://nitter.privacyredirect.com',
    'https://nitter.privacydev.net',
    'https://nitter.poast.org',
    'http://nitter.coffee2m3bjsrrqqycx6ghkxrnejl2q6nl7pjw2j4clchjj6uk5zozad.onion',
]

# The Pnytter object needs at least 1 Nitter instance to work, but more
# can be added after initialization.
pnytter = Pnytter(
    nitter_instances=local_instances if localserver else remote_instances
)
print("Local server" if localserver else "Remote server")



Local server


Si partimos de datos ya descargados, estos han de estar en un fichero denominado **authors_previo.csv** para seguir descargando datos a partir de ellos.

In [40]:
# Authors downloaded by an earlier run, if that file is present.
# When it is absent, df_anterior is simply not (re)assigned here.
ruta_authors_previos = 'authors_previo.csv'
if os.path.exists(ruta_authors_previos):
    df_anterior = pd.read_csv(ruta_authors_previos)

In [41]:
# Download the profile of every author in df_authors that is not already
# present in df_anterior, collecting the results column-wise in `data` and
# the usernames that failed in `error`.
cont = 0

# Number of handled authors between two intermediate saves.
CHECKPOINT_EVERY = 100

data = {
    'author_id': [],
    'username': [],
    'fullname': [],
    'biography': [],
    'verified': [],
    'joined_datetime': [],
    'tweets_count': [],
    'following_count': [],
    'followers_count': [],
    'likes_count': [],
    'profile_picture_url': [],
    'banner_picture_url': []
}

error = {
    'username': [],
}


def _perfil_a_fila(user, username):
    """Flatten one profile returned by pnytter.find_user into a dict with the same keys as `data`.

    Raises AttributeError when the profile lacks one of the attributes read
    here; nothing has been appended to `data` at that point, so its columns
    always keep the same length.
    """
    joined = user.joined_datetime
    profile = user.pictures.profile
    banner = user.pictures.banner
    return {
        # Kept as text: pandas converts an integer column that contains None
        # to float64, which cannot hold every digit of a 19-digit id.
        'author_id': None if user.id is None else str(user.id),
        'username': username,
        'fullname': user.fullname,
        'biography': user.biography,
        'verified': user.verified,
        'joined_datetime': None if joined is None else joined.strftime('%Y-%m-%d %H:%M:%S'),
        'tweets_count': user.stats.tweets,
        'following_count': user.stats.following,
        'followers_count': user.stats.followers,
        'likes_count': user.stats.likes,
        'profile_picture_url': None if profile is None else str(profile.twitter_url),
        'banner_picture_url': None if banner is None else str(banner.twitter_url),
    }


def _guardar(ruta_datos, ruta_errores):
    """Write the current `data` and `error` contents to CSV and return both frames."""
    df_datos = pd.DataFrame(data)
    df_errores = pd.DataFrame(error)
    df_datos.to_csv(ruta_datos, index=False)
    df_errores.to_csv(ruta_errores, index=False)
    return df_datos, df_errores


# Usernames already downloaded by a previous run (empty when the cell that
# loads df_anterior did not define it). Built once instead of scanning the
# column on every iteration.
ya_descargados = set(df_anterior.username) if 'df_anterior' in globals() else set()

for row in tqdm(df_authors.itertuples(), total=len(df_authors), desc='Buscando Authors'):

    # Skip authors that were already processed.
    if row.author in ya_descargados:
        cont += 1
        continue

    try:
        user = pnytter.find_user(row.author)
    except Exception:
        # Best effort: remember the username and move on to the next author.
        cont += 1
        error['username'].append(row.author)
        continue

    if user is None:
        # No profile returned: as before, the author is neither stored nor
        # listed as an error.
        continue

    try:
        fila = _perfil_a_fila(user, row.author)
    except AttributeError as e:
        # `e` is bound here; the original message referenced an unbound name.
        print(f"Error al procesar el user {row.author}: {e}")
        error['username'].append(row.author)
    else:
        # Append the complete row at once so every column grows together.
        for columna, valor in fila.items():
            data[columna].append(valor)

    cont += 1

    # `>=` because cont also grows on skipped/failed authors, where this
    # check is not reached; with `==` the counter could step past the
    # threshold and never trigger another intermediate save.
    if cont >= CHECKPOINT_EVERY:
        cont = 0
        df, df_error = _guardar('authors_temp.csv', 'errors_author_temp.csv')


df, df_error = _guardar('authors.csv', 'errors_author.csv')
print("Archivos authors.csv y errors_author.csv generados correctamente.")


Buscando Authors:   0%|          | 0/1325 [00:00<?, ?it/s]

Buscando Authors: 100%|██████████| 1325/1325 [00:14<00:00, 90.35it/s]

Archivos authors.csv y errors_author.csv generados correctamente.





## Concat de authors_previo y authors

Como hemos comentado anteriormente, si hemos partido de datos ya descargados, partiremos de authors_previo.csv; por eso hemos de hacer un concat de authors_previo.csv y authors.csv en authors_final.csv.

In [42]:
# Merge this run's download (authors.csv) with the authors from earlier runs
# (authors_previo.csv) and write the result to authors_final.csv.
#
# author_id is read as text: the column contains missing values, so the
# default parser would load it as float64 and the ids would be written back
# with lost digits.
hay_previos = os.path.exists('authors_previo.csv')
if hay_previos:
    df_temp = pd.read_csv('authors_previo.csv', dtype={'author_id': 'string'})

if os.path.exists('authors.csv'):
    df = pd.read_csv('authors.csv', dtype={'author_id': 'string'})
    if hay_previos:
        df_final = pd.concat([df, df_temp], ignore_index=True)
    else:
        # First run: there is nothing to merge with. Define df_temp as an
        # empty frame with the same columns so the following cells still run.
        df_temp = df.iloc[0:0]
        df_final = df.copy()
    df_final.to_csv('authors_final.csv', index=False)

In [43]:
# Row/column counts of the previous data, this run's download and the merge.
print(f"Datos en authors Previos: {df_temp.shape}")
print(f"Datos descargados en esta ejecución: {df.shape}")
print(f"Datos Totales: {df_final.shape}")

Datos en authors Previos: (1320, 12)
Datos descargados en esta ejecución: (5, 12)
Datos Totales: (1325, 12)


In [44]:
# Verificar duplicatas em todas as colunas
duplicated_rows = df_final[df_final.duplicated()]

# Exibir as duplicatas
print(duplicated_rows)

# Contar o número de duplicatas
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [author_id, username, fullname, biography, verified, joined_datetime, tweets_count, following_count, followers_count, likes_count, profile_picture_url, banner_picture_url]
Index: []
Número de duplicatas: 0


In [45]:
# Verificar duplicatas em todas as colunas
duplicated_rows = df[df.duplicated()]

# Exibir as duplicatas
print(duplicated_rows)

# Contar o número de duplicatas
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [author_id, username, fullname, biography, verified, joined_datetime, tweets_count, following_count, followers_count, likes_count, profile_picture_url, banner_picture_url]
Index: []
Número de duplicatas: 0


In [46]:
# Verificar duplicatas em todas as colunas
duplicated_rows = df_temp[df_temp.duplicated()]

# Exibir as duplicatas
print(duplicated_rows)

# Contar o número de duplicatas
print(f"Número de duplicatas: {duplicated_rows.shape[0]}")

Empty DataFrame
Columns: [author_id, username, fullname, biography, verified, joined_datetime, tweets_count, following_count, followers_count, likes_count, profile_picture_url, banner_picture_url]
Index: []
Número de duplicatas: 0


In [47]:
# borramos y renombramos los ficheros resultantes
os.remove('authors.csv')
os.remove('authors_previo.csv')
os.rename('authors_final.csv', 'authors_previo.csv')

In [48]:
# Re-read the consolidated file to confirm what ended up on disk.
df_test = pd.read_csv('authors_previo.csv')
print(f"Datos Finales: {df_test.shape}")

Datos Finales: (1325, 12)


In [50]:
# Preview the first five consolidated author rows.
df_test.head(n=5)

Unnamed: 0,author_id,username,fullname,biography,verified,joined_datetime,tweets_count,following_count,followers_count,likes_count,profile_picture_url,banner_picture_url
0,1.624806e+18,MrsGaliano,el Tio de la Alpargata,Defender tu partido político NO es defender a ...,True,2023-02-12 16:21:00,12128,442,159,20378,https://pbs.twimg.com/profile_images/162528661...,https://pbs.twimg.com/profile_banners/16248057...
1,1365578000.0,LomasSan,santilzzz,QUE LE VOY A HACER SI TENGO LA MENTE SUCIA😈😈 E...,True,2013-04-19 22:56:00,13950,1285,340,181290,https://pbs.twimg.com/profile_images/916305598...,https://pbs.twimg.com/profile_banners/13655784...
2,1.039863e+18,Alaiintxo,Alain,2024-04-06,True,2018-09-12 13:05:00,5413,231,101,23586,https://pbs.twimg.com/profile_images/186399114...,https://pbs.twimg.com/profile_banners/10398625...
3,1.247075e+18,EHermafrodito,ElSeñoritoH,La gente se toma tuiter demasiado en serio. Lo...,True,2020-04-06 08:13:00,57179,1794,1946,122356,https://pbs.twimg.com/profile_images/176674614...,https://pbs.twimg.com/profile_banners/12470748...
4,,GonrodMarilu,Mardecastilla,,True,2023-02-03 21:12:00,70066,410,99,14,,
