In [21]:
import ast
import json
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

In [22]:
# Load the flattened tweet export that every later cell works from.
df_data = pd.read_csv(filepath_or_buffer="./traficogt_flat.csv")

In [23]:
# Quick overview of the raw frame: size first, then sample rows, dtypes
# and the number of missing values per column.
print("Información inicial del dataset:")
print(f"Filas: {df_data.shape[0]}, Columnas: {df_data.shape[1]}")
for heading, summary in (
    ("\nPrimeras filas:", df_data.head()),
    ("\nTipos de datos:", df_data.dtypes),
    ("\nValores nulos por columna:", df_data.isnull().sum()),
):
    print(heading)
    print(summary)

Información inicial del dataset:
Filas: 5604, Columnas: 44

Primeras filas:
                    id                       date             user  \
0  1834236045598056867  2024-09-12 14:22:06+00:00        traficogt   
1  1834029142565658846  2024-09-12 00:39:56+00:00     monymmorales   
2  1834039491826180424  2024-09-12 01:21:04+00:00  animaldgalaccia   
3  1833963729136091179  2024-09-11 20:20:01+00:00   EstacionDobleA   
4  1833665391698092330  2024-09-11 00:34:31+00:00       CubReserva   

               user_id           user_displayname  \
0             93938886                  traficoGT   
1            976875408                       Mony   
2  1730828822029750272           Jairo De La Nada   
3  1802661334355456000           Estación Doble A   
4  1155617398675988481  CUB Reserva Kanajuyu Z 16   

                                 user_rawDescription  \
0                    Noticias de ciudad de Guatemala   
1  Iglesia y estado son asunto separado.\nCatólic...   
2  Sancarlista. 

In [24]:
# Columns of the raw frame that describe the tweet author.
user_columns = [
    "user",
    "user_id",
    "user_displayname",
    "user_rawDescription",
    "user_created",
    "user_followersCount",
    "user_favouritesCount",
    "user_listedCount",
    "user_mediaCount",
    "user_location",
    "user_protected",
    "user_verified",
    "user_blue",
    "user_blueType",
    "user_descriptionLinks",
    "user_pinnedIds",
]

# Take an independent copy of those columns and replace every missing
# value with the "NA" string sentinel used by the cleaning steps below.
df_users = df_data[user_columns].copy().fillna("NA")

In [25]:
# Normalize the handle and the display name to lowercase text.
for name_col in ('user', 'user_displayname'):
    df_users[name_col] = df_users[name_col].astype(str).str.lower()

In [26]:
# Cleaning of the user description text.
def clean_description(text):
    """Normalize a free-text profile description.

    The value is lowercased, then URLs, @mentions, #hashtags, every
    character that is neither a word character nor whitespace, and all
    digits are removed, in that order. Whitespace runs are collapsed to a
    single space and the ends are trimmed.

    Returns the cleaned string, or "NA" when the input is the "NA"
    sentinel, is missing, or becomes empty after cleaning.
    """
    if text == "NA" or pd.isna(text):
        return "NA"

    # (pattern, flags) pairs applied in sequence; order matters because the
    # punctuation pass would otherwise break URLs, mentions and hashtags.
    removals = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'@\w+', 0),                                # mentions
        (r'#\w+', 0),                                # hashtags
        (r'[^\w\s]', 0),                             # punctuation / symbols
        (r'\d+', 0),                                 # digits
    )

    cleaned = str(text).lower()
    for pattern, flags in removals:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    # Collapse whitespace left behind by the removals.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    return cleaned or "NA"

# Clean every profile description element-wise.
df_users['user_rawDescription'] = df_users['user_rawDescription'].map(clean_description)

In [27]:
def clean_location(location):
    """Normalize a free-text user location.

    Lowercases the value, drops every character that is neither a word
    character nor whitespace, and collapses whitespace runs.

    Returns the cleaned string, or "NA" when the input is the "NA"
    sentinel, is missing, or is empty after cleaning.
    """
    if location == "NA" or pd.isna(location):
        return "NA"

    lowered = str(location).lower()
    # Strip punctuation and symbols, then tidy up the remaining whitespace.
    without_symbols = re.sub(r'[^\w\s]', '', lowered)
    normalized = re.sub(r'\s+', ' ', without_symbols).strip()

    return normalized or "NA"

# Clean every user location element-wise.
df_users['user_location'] = df_users['user_location'].map(clean_location)

In [28]:
# Cleaning of the pinned-ids column.
def clean_pinned_ids(value):
    """Normalize a pinned-tweet id to its decimal string form.

    Integers and integer-looking strings are converted exactly. Going
    through ``float`` first would round any id above 2**53 (19-digit tweet
    ids are far beyond that), so the float path is only a fallback for
    inputs such as ``"123.0"`` or values that already are floats.

    A value that already arrives as a float has been rounded before it
    reaches this function; it is converted as-is and cannot be recovered.

    Returns the id as a string, or "NA" when the input is the "NA"
    sentinel, missing, empty, or not numeric (including infinities).
    """
    if value == "NA" or pd.isna(value) or value == '':
        return "NA"
    try:
        # Exact path for values that are already integers.
        if isinstance(value, (int, np.integer)):
            return str(int(value))
        # Exact path for integer-looking text; avoids the float round-trip.
        if isinstance(value, str):
            try:
                return str(int(value.strip()))
            except ValueError:
                pass  # e.g. "123.0" -> handled by the float fallback below
        return str(int(float(value)))
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float("inf")), which previously escaped.
        return "NA"

# Normalize every pinned-id value element-wise.
df_users['user_pinnedIds'] = df_users['user_pinnedIds'].map(clean_pinned_ids)

In [29]:
# Cleaning of the description-links column.
def clean_description_links(value):
    """Normalize the description-links field to a lowercase string.

    A string that looks like a Python list literal (``"[...]"``) is parsed
    and its items are lowercased and joined with ``", "``. Any other value
    is converted with ``str`` and lowercased.

    Returns the normalized string, or "NA" when the input is the "NA"
    sentinel, missing, empty, or a bracketed string that is not a valid
    literal.
    """
    if value == "NA" or pd.isna(value) or value == '':
        return "NA"
    try:
        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
            # SECURITY: this text comes from a CSV file, so it must never be
            # passed to eval(). ast.literal_eval only accepts literals and
            # rejects calls, names and attribute access.
            links = ast.literal_eval(value)
            if isinstance(links, list):
                return ', '.join(str(link).lower() for link in links)
        return str(value).lower()
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Exceptions ast.literal_eval can raise on malformed input.
        return "NA"

# Normalize every description-links value element-wise.
df_users['user_descriptionLinks'] = df_users['user_descriptionLinks'].map(clean_description_links)

In [30]:
# Convert the flag columns to real booleans. Only these spellings
# (compared case-insensitively) count as True; everything else, including
# the "NA" sentinel, becomes False.
bool_columns = ['user_protected', 'user_verified', 'user_blue']
truthy_tokens = ['true', '1', 'yes']
for col in bool_columns:
    df_users[col] = df_users[col].apply(lambda x: str(x).lower() in truthy_tokens)

# %%
# Convert user_created to datetime. Values that cannot be parsed (including
# the "NA" sentinel) become NaT. utc=True keeps the whole column in a single
# tz-aware UTC dtype, matching the "+00:00" timestamps in the source data.
df_users['user_created'] = pd.to_datetime(df_users['user_created'], errors='coerce', utc=True)
# Replace NaT with a sentinel date. The sentinel is tz-aware so it does not
# mix tz-naive and tz-aware values in one column (a naive Timestamp here
# raises or degrades the column to object dtype, depending on the pandas
# version).
df_users['user_created'] = df_users['user_created'].fillna(pd.Timestamp('2000-01-01', tz='UTC'))

# Counter columns: coerce to numbers, treat anything unparseable
# (including the "NA" sentinel) as 0, and store as integers.
numeric_columns = ['user_followersCount', 'user_favouritesCount', 'user_listedCount', 'user_mediaCount']
for col in numeric_columns:
    parsed = pd.to_numeric(df_users[col], errors='coerce')
    df_users[col] = parsed.fillna(0).astype(int)

In [31]:
# Keep one row per user_id (the first occurrence) and report the row
# count before and after.
print(f"Filas antes de eliminar duplicados: {len(df_users)}")
is_repeat = df_users.duplicated(subset=['user_id'], keep='first')
df_users = df_users[~is_repeat]
print(f"Filas después de eliminar duplicados: {len(df_users)}")

Filas antes de eliminar duplicados: 5604
Filas después de eliminar duplicados: 2071


In [32]:
# Show the cleaned user table as the cell's output.
df_users

Unnamed: 0,user,user_id,user_displayname,user_rawDescription,user_created,user_followersCount,user_favouritesCount,user_listedCount,user_mediaCount,user_location,user_protected,user_verified,user_blue,user_blueType,user_descriptionLinks,user_pinnedIds
0,traficogt,93938886,traficogt,noticias de ciudad de guatemala,2009-12-01 20:42:19+00:00,314368,3471,291,1292,guatemala,False,False,False,,0,
1,monymmorales,976875408,mony,iglesia y estado son asunto separado católica ...,2012-11-28 20:16:36+00:00,5502,274770,24,16644,guatemala frijolandia,False,False,False,,0,1794803010935431424
2,animaldgalaccia,1730828822029750272,jairo de la nada,sancarlista estudiante de filosofía ideológica...,2023-12-02 05:58:56+00:00,571,25257,2,1013,macondo,False,False,False,,1,1830424083143512576
3,estaciondoblea,1802661334355456000,estación doble a,si no ayuda no estorbe y no chingue,2024-06-17 11:16:06+00:00,18,30,0,11,,False,False,False,,0,
4,cubreserva,1155617398675988481,cub reserva kanajuyu z 16,preocupados por el medio ambiente somos una re...,2019-07-28 23:13:52+00:00,170,326,0,75,,False,False,False,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5580,mincho_guate,2824297276,luis castellanos,,2014-10-12 03:37:39+00:00,74,1214,0,68,en algun lugar de ca,False,False,False,,0,
5585,luisvonahn,8381682,luis von ahn,ceo cofounder of invented recaptcha macarthur ...,2007-08-23 12:55:10+00:00,160235,1416,1114,139,pittsburgh pa,False,False,True,,1,
5588,bufeprofesional,713099124447416320,gerber garcia,,2016-03-24 20:24:14+00:00,17,45,0,22,,False,False,False,,0,
5598,lucreciave97613,1676049164948451328,lucrecia velásquez,ver noticias y así estar enterada de lo que su...,2023-07-04 02:05:18+00:00,9,1171,0,69,,False,False,False,,0,


In [33]:
# Count the remaining missing values per column after cleaning.
null_counts = df_users.isnull().sum()
print("\nValores nulos por columna:")
print(null_counts)


Valores nulos por columna:
user                     0
user_id                  0
user_displayname         0
user_rawDescription      0
user_created             0
user_followersCount      0
user_favouritesCount     0
user_listedCount         0
user_mediaCount          0
user_location            0
user_protected           0
user_verified            0
user_blue                0
user_blueType            0
user_descriptionLinks    0
user_pinnedIds           0
dtype: int64


In [34]:
# Write the cleaned user table to disk without the index column.
df_users.to_csv(path_or_buf="usuarios.csv", index=False)