In [22]:
import pandas as pd
import nltk
import re
import emoji
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer


nltk.download('vader_lexicon')
nltk.download('stopwords')


from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/tomy07417/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tomy07417/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Read the train and test splits from the local data directory.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Disaster keyword list: one entry per line of the file, kept in file order
# (count_claves relies on that order when it slices the first entries).
with open("./words_desastres.txt", "r", encoding="utf-8") as keywords_file:
    words_desastres = keywords_file.read().splitlines()

In [24]:
# Sentiment models shared by the train and test feature cells.
# Building the pipeline loads the RoBERTa checkpoint; scoring every tweet with
# it is the slow step of this cell.
sia = SentimentIntensityAnalyzer()
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
)

# VADER polarity scores. Columns are picked by name so the assignment does not
# depend on the key order of the dict returned by polarity_scores.
vader_cols = ['neg', 'neu', 'pos', 'compound']
train_df[vader_cols] = pd.DataFrame(
    [sia.polarity_scores(texto) for texto in train_df.text],
    index=train_df.index
)[vader_cols]

# Transformer label/score. The whole list of tweets is handed to the pipeline
# in one call (it returns one {'label', 'score'} dict per input) instead of
# calling it row by row and wrapping every result in a pd.Series.
roberta_scores = pd.DataFrame(
    sentiment_analyzer(train_df.text.tolist()),
    index=train_df.index
).rename(columns={'label': 'sentimiento', 'score': 'score_global'})

# Drop stale copies first so that re-running this cell does not make join()
# fail on overlapping column names.
train_df = train_df.drop(columns=roberta_scores.columns, errors='ignore').join(roberta_scores)

# Tweet length in characters.
train_df['len'] = train_df.text.map(len)

# Tweet-aware tokenizer used by the feature helpers in this notebook.
tokenizer = TweetTokenizer(
    preserve_case=False,  # lowercase every token
    strip_handles=True,   # drop @user handles
    reduce_len=True       # shorten long runs of a repeated character
)

def tokens(texto):
    """Return the filtered tokens of a tweet.

    URLs are first replaced by the placeholder word ``url`` and whitespace is
    collapsed. The text is then split with the module-level ``tokenizer``.
    Only tokens longer than two characters that consist solely of the
    characters ``a-z`` and ``#`` are kept.

    Args:
        texto: Raw tweet text.

    Returns:
        List of the surviving tokens, in their original order.
    """
    # Step 1: swap every URL for a placeholder and normalise whitespace.
    sin_urls = re.sub(r"http\S+|www\S+", " url ", texto)
    sin_urls = re.sub(r"\s+", " ", sin_urls).strip()

    # Steps 2 and 3: tokenize, then discard short tokens and anything that
    # contains digits, punctuation or other non a-z/# characters.
    resultado = []
    for tok in tokenizer.tokenize(sin_urls):
        if len(tok) <= 2:
            continue
        if re.match(r"^[a-z#]+$", tok):
            resultado.append(tok)

    return resultado

def count_claves(texto, top_n=50):
    """Count how many of the leading disaster keywords occur in a tweet.

    Args:
        texto: Raw tweet text; it is tokenized with ``tokens``.
        top_n: Number of entries taken from the start of ``words_desastres``.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        The number of entries among the first ``top_n`` of ``words_desastres``
        that appear as a token of ``texto``. An entry listed more than once in
        that slice is counted once per listing.
    """
    # A set gives constant-time membership tests; the count is unchanged
    # because each keyword is checked for presence, not for frequency.
    text_tokens = set(tokens(texto))
    return sum(1 for word in words_desastres[:top_n] if word in text_tokens)

def count_hashtags(texto):
    """Count the tokens of a tweet that begin with ``#``.

    The text is split with the module-level ``tokenizer``; every resulting
    token whose first character is ``#`` adds one to the count.
    """
    total = 0
    for tok in tokenizer.tokenize(texto):
        if tok.startswith('#'):
            total += 1
    return total

# Hand-crafted text features for the training set, one column at a time so
# that no per-row pd.Series is built and each tweet is tokenized only once
# per feature.
# tiene_url tests the raw text with the same URL pattern that tokens() uses.
# Looking for the placeholder token "url" instead would also flag tweets that
# merely contain the word "url".
train_df['tiene_url'] = train_df.text.map(
    lambda x: "SI" if re.search(r"http\S+|www\S+", x) else "NO"
)
train_df['palabras_claves'] = train_df.text.map(count_claves)
train_df['hashtags'] = train_df.text.map(count_hashtags)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [25]:
def limpiar_texto(texto):
    """Normalise a tweet into lowercase text with placeholder tokens.

    Steps, in order: URLs become ``url``, ``@mentions`` become ``user``,
    ``#tag`` becomes ``hashtag_tag``, emojis become ``emoji_<english name>``,
    every character that is not a letter (including Spanish accented letters
    and n-tilde), digit, underscore or whitespace is replaced by a space, the
    text is lowercased and runs of whitespace are collapsed.

    Args:
        texto: Raw tweet text.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    # Replace URLs with a placeholder token.
    texto = re.sub(r"http\S+|www\S+", " URL ", texto)
    # Replace mentions (@user) with a placeholder token.
    texto = re.sub(r"@\w+", " USER ", texto)
    # Replace hashtags, keeping the tagged word.
    texto = re.sub(r"#(\w+)", r" HASHTAG_\1 ", texto)
    # Replace emojis with their English name. demojize emits the EMOJI_ prefix
    # directly through its delimiters; rewriting the default ":name:" form
    # with a regex would also fire on ordinary colon-delimited text such as
    # the ":30:" inside "10:30:45".
    texto = emoji.demojize(texto, delimiters=(" EMOJI_", " "), language="en")
    # Remove every character outside letters, digits, underscore, whitespace.
    texto = re.sub(r"[^a-zA-ZáéíóúÁÉÍÓÚñÑ0-9_\s]", " ", texto)
    # Lowercase.
    texto = texto.lower()
    # Collapse repeated whitespace.
    texto = re.sub(r"\s+", " ", texto).strip()

    return texto

def tokenize_tweet(text):
    """Clean a tweet with ``limpiar_texto`` and split the result into tokens.

    The cleaned string is tokenized with the module-level ``tokenizer``; the
    resulting token list is returned as is.
    """
    cleaned = limpiar_texto(text)
    return tokenizer.tokenize(cleaned)

# Bag-of-words counts over the cleaned tweets, limited to the 1000 most
# frequent terms.
vectorizer = CountVectorizer(
    tokenizer=tokenize_tweet,
    # token_pattern is ignored once a custom tokenizer is supplied; passing
    # None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    lowercase=True,
    stop_words='english',
    max_features=1000
)

# Fit the vocabulary on the training tweets and expand the counts into one
# column per vocabulary term.
X_bow = vectorizer.fit_transform(train_df.text)
df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
# NOTE(review): vocabulary terms are used verbatim as column names. A term
# equal to an existing column (for example "text" or "len") would produce
# duplicate column labels in train_df - confirm none collide.
train_df = pd.concat([train_df.reset_index(drop=True), df_bow.reset_index(drop=True)], axis=1)




In [26]:
# VADER polarity scores for the test set. Columns are picked by name so the
# assignment does not depend on the key order of the returned dict.
vader_cols = ['neg', 'neu', 'pos', 'compound']
test_df[vader_cols] = pd.DataFrame(
    [sia.polarity_scores(texto) for texto in test_df.text],
    index=test_df.index
)[vader_cols]

# Transformer label/score. The whole list of tweets is handed to the pipeline
# in one call (it returns one {'label', 'score'} dict per input) instead of
# calling it row by row and wrapping every result in a pd.Series.
roberta_scores_test = pd.DataFrame(
    sentiment_analyzer(test_df.text.tolist()),
    index=test_df.index
).rename(columns={'label': 'sentimiento', 'score': 'score_global'})

# Drop stale copies first so that re-running this cell does not make join()
# fail on overlapping column names.
test_df = test_df.drop(columns=roberta_scores_test.columns, errors='ignore').join(roberta_scores_test)

# Tweet length in characters.
test_df['len'] = test_df.text.map(len)

# Hand-crafted text features, one column at a time so that no per-row
# pd.Series is built. tiene_url tests the raw text with the same URL pattern
# that tokens() uses. Looking for the placeholder token "url" instead would
# also flag tweets that merely contain the word "url".
test_df['tiene_url'] = test_df.text.map(
    lambda x: "SI" if re.search(r"http\S+|www\S+", x) else "NO"
)
test_df['palabras_claves'] = test_df.text.map(count_claves)
test_df['hashtags'] = test_df.text.map(count_hashtags)


In [27]:
# Encode the test tweets with the already-fitted vectorizer (transform only,
# no refit) and append one count column per vocabulary term.
X_bow_test = vectorizer.transform(test_df.text)
df_bow_test = pd.DataFrame(
    X_bow_test.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
test_df = pd.concat(
    [test_df.reset_index(drop=True), df_bow_test.reset_index(drop=True)],
    axis=1,
)

In [28]:
# Preview the last 20 rows of the training frame, including the added
# feature columns.
train_df.tail(20)

Unnamed: 0,id,keyword,location,text,target,neg,neu,pos,compound,sentimiento,...,x,yeah,year,years,yes,york,young,youth,zone,√≥
7593,10848,,,I just heard a really loud bang and everyone i...,0,0.0,0.687,0.313,0.6249,positive,...,0,0,0,0,0,0,0,0,0,0
7594,10849,,,A gas thing just exploded and I heard screams ...,1,0.138,0.862,0.0,-0.3736,negative,...,0,0,0,0,0,0,0,0,0,0
7595,10850,,,NWS: Flash Flood Warning Continued for Shelby ...,1,0.156,0.844,0.0,-0.34,neutral,...,0,0,0,0,0,0,0,0,0,0
7596,10851,,,RT @LivingSafely: #NWS issues Severe #Thunders...,1,0.345,0.655,0.0,-0.7841,neutral,...,0,0,0,0,0,0,0,0,0,0
7597,10852,,,#??? #?? #??? #??? MH370: Aircraft debris foun...,1,0.157,0.843,0.0,-0.4871,negative,...,0,0,0,0,0,0,0,0,0,0
7598,10853,,,Father-of-three Lost Control of Car After Over...,1,0.187,0.813,0.0,-0.3182,neutral,...,0,0,0,0,0,0,0,0,0,0
7599,10854,,,1.3 #Earthquake in 9Km Ssw Of Anza California ...,1,0.0,1.0,0.0,0.0,neutral,...,0,0,0,0,0,0,0,0,0,0
7600,10855,,,Evacuation order lifted for town of Roosevelt:...,1,0.0,1.0,0.0,0.0,neutral,...,0,0,0,0,0,0,0,0,0,0
7601,10859,,,#breaking #LA Refugio oil spill may have been ...,1,0.0,1.0,0.0,0.0,neutral,...,0,0,0,0,0,0,0,0,0,0
7602,10860,,,a siren just went off and it wasn't the Forney...,1,0.201,0.799,0.0,-0.4137,negative,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Persist the training frame with all engineered features (row index omitted).
train_df.to_csv(
    "./data/train_with_features.csv",
    index=False,
    encoding="utf-8",
)

In [30]:
# Persist the test frame with all engineered features (row index omitted).
test_df.to_csv(
    "./data/test_with_features.csv",
    index=False,
    encoding="utf-8",
)