In [26]:
import pandas as pd
import json
import re
import string
from pathlib import Path
from collections import defaultdict

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_es = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thiag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
data_path = Path("data/traficogt.txt")

raw = data_path.read_bytes()

is_utf16 = raw.startswith(b'\xff\xfe') or raw.startswith(b'\xfe\xff')
encoding = "utf-16" if is_utf16 else "utf-8"

text = raw.decode(encoding, errors="replace")

lines = [ln.strip() for ln in text.split("\r\n") if ln.strip()]

tweets = []
bad = []  

for i, ln in enumerate(lines, start=1):
    try:
        tweets.append(json.loads(ln))
    except Exception as e:
        bad.append((i, str(e)))

print(f"Tweets cargados: {len(tweets)}")
print(f"Líneas descartadas: {len(bad)}")

Tweets cargados: 5604
Líneas descartadas: 1


In [28]:
tweets[0]

{'id': 1834236045598056867,
 'id_str': '1834236045598056867',
 'url': 'https://x.com/traficogt/status/1834236045598056867',
 'date': '2024-09-12 14:22:06+00:00',
 'user': {'id': 93938886,
  'id_str': '93938886',
  'url': 'https://x.com/traficogt',
  'username': 'traficogt',
  'displayname': 'traficoGT',
  'rawDescription': 'Noticias de ciudad de Guatemala',
  'created': '2009-12-01 20:42:19+00:00',
  'followersCount': 314368,
  'friendsCount': 137,
  'statusesCount': 52385,
  'favouritesCount': 3471,
  'listedCount': 291,
  'mediaCount': 1292,
  'location': 'Guatemala',
  'profileImageUrl': 'https://pbs.twimg.com/profile_images/1782036597841530880/-tVuhOdK_normal.jpg',
  'profileBannerUrl': None,
  'protected': None,
  'verified': False,
  'blue': False,
  'blueType': None,
  'descriptionLinks': [],
  'pinnedIds': [],
  '_type': 'snscrape.modules.twitter.User'},
 'lang': 'es',
 'rawContent': 'Es comprensible la resolución... El ruso sabe de engrasar maquinaria.',
 'replyCount': 0,
 're

In [29]:
df = pd.json_normalize(tweets)

cols = ["id", "date", "user.username", "user.id", "rawContent", 
        "replyCount", "retweetCount", "likeCount", "quoteCount",
        "mentionedUsers"]

df = df[cols]
df

Unnamed: 0,id,date,user.username,user.id,rawContent,replyCount,retweetCount,likeCount,quoteCount,mentionedUsers
0,1834236045598056867,2024-09-12 14:22:06+00:00,traficogt,93938886,Es comprensible la resolución... El ruso sabe ...,0,0,1,0,[]
1,1834029142565658846,2024-09-12 00:39:56+00:00,monymmorales,976875408,La corrupción de la @CC_Guatemala\nes descarad...,0,56,84,4,"[{'id': 783345301256073216, 'id_str': '7833453..."
2,1834039491826180424,2024-09-12 01:21:04+00:00,animaldgalaccia,1730828822029750272,@PNCdeGuatemala @mingobguate @FJimenezmingob @...,0,0,1,0,"[{'id': 130315077, 'id_str': '130315077', 'use..."
3,1833963729136091179,2024-09-11 20:20:01+00:00,EstacionDobleA,1802661334355456000,@amilcarmontejo @AztecaNoticiaGT @BancadaSemil...,0,0,0,0,"[{'id': 372126670, 'id_str': '372126670', 'use..."
4,1833665391698092330,2024-09-11 00:34:31+00:00,CubReserva,1155617398675988481,@soy_502 @AztecaNoticiaGT @CONAPgt @DenunciaEM...,0,0,1,0,"[{'id': 1687984068, 'id_str': '1687984068', 'u..."
...,...,...,...,...,...,...,...,...,...,...
5599,1711138940990722120,2023-10-08 21:58:09+00:00,guiselabarrios,27352856,@traficogt Y no que presentando 5 mil firmas a...,0,0,0,0,"[{'id': 93938886, 'id_str': '93938886', 'usern..."
5600,1711133805182869820,2023-10-08 21:37:45+00:00,mvtrooper,591424023,@hshetemul @traficogt Y de igual Manera quitan...,0,0,0,0,"[{'id': 633615711, 'id_str': '633615711', 'use..."
5601,1711133697552810362,2023-10-08 21:37:19+00:00,elmeronene1,1572301195032625152,@traficogt Es algo más fuerte que ellos no qui...,0,0,0,0,"[{'id': 93938886, 'id_str': '93938886', 'usern..."
5602,1711132207631212797,2023-10-08 21:31:24+00:00,mvtrooper,591424023,@Factor4_GT @traficogt @CC_Guatemala @MPguatem...,0,0,0,0,"[{'id': 1241496971678015489, 'id_str': '124149..."


In [30]:
def clean_text(text):
    # Minúsculas
    text = text.lower()
    # Quitar urls
    text = re.sub(r"http\S+|www\S+", "", text)
    # Quitar menciones y hashtags
    text = re.sub(r"[@#]\w+", "", text)
    # Quitar emojis y caracteres no alfanuméricos básicos
    text = re.sub(r"[^\w\sáéíóúüñ]", "", text)
    # Quitar números
    text = re.sub(r"\d+", "", text)
    # Quitar stopwords
    words = [w for w in text.split() if w not in stopwords_es]
    return " ".join(words)

df["clean_text"] = df["rawContent"].astype(str).apply(clean_text)
df[["rawContent", "clean_text"]].head()

Unnamed: 0,rawContent,clean_text
0,Es comprensible la resolución... El ruso sabe ...,comprensible resolución ruso sabe engrasar maq...
1,La corrupción de la @CC_Guatemala\nes descarad...,corrupción descarada falsificación documentos ...
2,@PNCdeGuatemala @mingobguate @FJimenezmingob @...,
3,@amilcarmontejo @AztecaNoticiaGT @BancadaSemil...,
4,@soy_502 @AztecaNoticiaGT @CONAPgt @DenunciaEM...,urgente zona deterioro tala inmoderada tráfico...


In [31]:
# Eliminar duplicados por id de tweet
df = df.drop_duplicates(subset=["id"])

# Normalizar usernames a minúsculas
df["user.username"] = df["user.username"].str.lower()

# Normalizar menciones
def normalize_mentions(m):
    if isinstance(m, list):
        return [x["username"].lower() for x in m if "username" in x]
    return []
df["mentions"] = df["mentionedUsers"].apply(normalize_mentions)

df[["user.username", "mentions"]].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user.username"] = df["user.username"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mentions"] = df["mentionedUsers"].apply(normalize_mentions)


Unnamed: 0,user.username,mentions
0,traficogt,[]
1,monymmorales,[cc_guatemala]
2,animaldgalaccia,"[pncdeguatemala, mingobguate, fjimenezmingob, ..."
3,estaciondoblea,"[amilcarmontejo, aztecanoticiagt, bancadasemil..."
4,cubreserva,"[soy_502, aztecanoticiagt, conapgt, denunciaem..."


In [32]:
from zoneinfo import ZoneInfo 
df["datetime_utc"] = pd.to_datetime(df["date"], utc=True, errors="coerce")

# 2) Conversión a la zona horaria de Guatemala
TZ_GT = ZoneInfo("America/Guatemala")
df["datetime_gt"] = df["datetime_utc"].dt.tz_convert(TZ_GT)

df[["date", "datetime_utc", "datetime_gt"]].head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["datetime_utc"] = pd.to_datetime(df["date"], utc=True, errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["datetime_gt"] = df["datetime_utc"].dt.tz_convert(TZ_GT)


Unnamed: 0,date,datetime_utc,datetime_gt
0,2024-09-12 14:22:06+00:00,2024-09-12 14:22:06+00:00,2024-09-12 08:22:06-06:00
1,2024-09-12 00:39:56+00:00,2024-09-12 00:39:56+00:00,2024-09-11 18:39:56-06:00
2,2024-09-12 01:21:04+00:00,2024-09-12 01:21:04+00:00,2024-09-11 19:21:04-06:00


In [33]:
# Strings legibles
df["fecha_gt"] = df["datetime_gt"].dt.strftime("%Y-%m-%d")
df["hora_gt"]  = df["datetime_gt"].dt.strftime("%H:%M:%S")

# Derivados útiles para análisis
df["anio"]        = df["datetime_gt"].dt.year
df["mes_num"]     = df["datetime_gt"].dt.month
df["dia"]         = df["datetime_gt"].dt.day
df["hora"]        = df["datetime_gt"].dt.hour
df["semana_iso"]  = df["datetime_gt"].dt.isocalendar().week

# Nombres en español
try:
    df["mes_nombre"]   = df["datetime_gt"].dt.month_name(locale="es_ES")
    df["dia_semana"]   = df["datetime_gt"].dt.day_name(locale="es_ES")
except Exception:
    MES = {1:"enero",2:"febrero",3:"marzo",4:"abril",5:"mayo",6:"junio",
           7:"julio",8:"agosto",9:"septiembre",10:"octubre",11:"noviembre",12:"diciembre"}
    DIA = {0:"lunes",1:"martes",2:"miércoles",3:"jueves",4:"viernes",5:"sábado",6:"domingo"}
    df["mes_nombre"] = df["mes_num"].map(MES)
    df["dia_semana"] = df["datetime_gt"].dt.weekday.map(DIA)

# Timestamp (segundos UNIX) para joins y ordenamiento eficiente
try:
    df["timestamp_unix"] = (df["datetime_utc"].astype("int64") // 10**9)
except Exception:
    df["timestamp_unix"] = pd.to_numeric(df["datetime_utc"].view("int64"), errors="coerce") // 10**9

df[["fecha_gt","hora_gt","anio","mes_num","mes_nombre","dia_semana","hora","timestamp_unix"]].head(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["fecha_gt"] = df["datetime_gt"].dt.strftime("%Y-%m-%d")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["hora_gt"]  = df["datetime_gt"].dt.strftime("%H:%M:%S")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["anio"]        = df["datetime_gt"].dt.year
A value is trying to be set on a copy of 

Unnamed: 0,fecha_gt,hora_gt,anio,mes_num,mes_nombre,dia_semana,hora,timestamp_unix
0,2024-09-12,08:22:06,2024,9,Septiembre,Jueves,8,1726150926
1,2024-09-11,18:39:56,2024,9,Septiembre,Miércoles,18,1726101596
2,2024-09-11,19:21:04,2024,9,Septiembre,Miércoles,19,1726104064
3,2024-09-11,14:20:01,2024,9,Septiembre,Miércoles,14,1726086001
4,2024-09-10,18:34:31,2024,9,Septiembre,Martes,18,1726014871


In [34]:
front_cols = [
    "fecha_gt","hora_gt","anio","mes_num","mes_nombre","dia_semana","hora","semana_iso","timestamp_unix",
    "id","user.username","rawContent","replyCount","retweetCount","likeCount","quoteCount"
]
final_cols = [c for c in front_cols if c in df.columns] + [c for c in df.columns if c not in front_cols]
df = df[final_cols]
cols_to_drop = ["semana_iso", "timestamp_unix"]
df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])
df.head(3)


Unnamed: 0,fecha_gt,hora_gt,anio,mes_num,mes_nombre,dia_semana,hora,id,user.username,rawContent,...,likeCount,quoteCount,date,user.id,mentionedUsers,clean_text,mentions,datetime_utc,datetime_gt,dia
0,2024-09-12,08:22:06,2024,9,Septiembre,Jueves,8,1834236045598056867,traficogt,Es comprensible la resolución... El ruso sabe ...,...,1,0,2024-09-12 14:22:06+00:00,93938886,[],comprensible resolución ruso sabe engrasar maq...,[],2024-09-12 14:22:06+00:00,2024-09-12 08:22:06-06:00,12
1,2024-09-11,18:39:56,2024,9,Septiembre,Miércoles,18,1834029142565658846,monymmorales,La corrupción de la @CC_Guatemala\nes descarad...,...,84,4,2024-09-12 00:39:56+00:00,976875408,"[{'id': 783345301256073216, 'id_str': '7833453...",corrupción descarada falsificación documentos ...,[cc_guatemala],2024-09-12 00:39:56+00:00,2024-09-11 18:39:56-06:00,11
2,2024-09-11,19:21:04,2024,9,Septiembre,Miércoles,19,1834039491826180424,animaldgalaccia,@PNCdeGuatemala @mingobguate @FJimenezmingob @...,...,1,0,2024-09-12 01:21:04+00:00,1730828822029750272,"[{'id': 130315077, 'id_str': '130315077', 'use...",,"[pncdeguatemala, mingobguate, fjimenezmingob, ...",2024-09-12 01:21:04+00:00,2024-09-11 19:21:04-06:00,11


In [35]:
import networkx as nx

# Grafo dirigido: usuario -> menciones
G = nx.DiGraph()

for _, row in df.iterrows():
    user = row["user.username"]
    mentions = row["mentions"]
    for m in mentions:
        G.add_edge(user, m, interaction="mention")

len(G.nodes), len(G.edges)


(2720, 7336)

In [36]:
output_path = Path("data/traficogt_clean.csv")
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"CSV guardado en {output_path.resolve()}")


CSV guardado en C:\Users\thiag\Universidad\social-networks\data\traficogt_clean.csv
