# In [None]:
from gnews import GNews
from datetime import datetime, timedelta
import csv
import os
import re
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Fetch the NLTK English stopword list used by clean_text
nltk.download('stopwords')

# Hugging Face checkpoint shared by the tokenizer and the classification model
CRYPTOBERT_CHECKPOINT = "kk08/CryptoBERT"

# Load the CryptoBERT tokenizer (model_max_length set to 512) and the model
tokenizer = AutoTokenizer.from_pretrained(
    CRYPTOBERT_CHECKPOINT,
    model_max_length=512,
)
model = AutoModelForSequenceClassification.from_pretrained(CRYPTOBERT_CHECKPOINT)

# Build the sentiment-analysis pipeline from the loaded model and tokenizer
classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# Clean raw text: strip URLs and punctuation, lowercase, drop English stopwords
def clean_text(text):
    """Return a normalised, stopword-free version of *text*.

    Processing steps, in order:

    1. Remove every run of non-whitespace characters that starts with "http".
    2. Remove every character that is neither a word character nor
       whitespace (punctuation, symbols, emoji).
    3. Lowercase the text and split it on whitespace.
    4. Drop tokens found in NLTK's English stopword list.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove icons and special characters
    text = re.sub(r'[^\w\s]', '', text)

    # Load the stopword list once per call and keep it in a set: evaluating
    # stopwords.words('english') inside the comprehension would re-run it
    # (and do a linear list scan) for every single word.
    english_stopwords = set(stopwords.words('english'))

    kept_words = [
        word for word in text.lower().split()
        if word not in english_stopwords
    ]
    return " ".join(kept_words)

# Translate the classifier's raw label into a readable sentiment name
def translate_label(label):
    """Map a raw model label to 'negative' / 'positive'.

    'LABEL_0' becomes 'negative' and 'LABEL_1' becomes 'positive'; any other
    value yields 'unknown'.
    """
    sentiment_by_label = dict(LABEL_0='negative', LABEL_1='positive')
    try:
        return sentiment_by_label[label]
    except KeyError:
        # Label not produced by the expected two-class head
        return 'unknown'

# Search keywords per token; terms not tied to a single coin live under 'GENERAL'.
# Every keyword is issued as its own news query and the dict key is written
# to the TOKEN column of the output.
# NOTE(review): most names appear in several capitalisations; if the news
# search is case-insensitive these yield duplicate queries/rows - confirm.
crypto_keywords = {
    'BTC': [
        'Bitcoin', 'BTC', 'Satoshi', '₿',
        'bitcoin', 'btc', 'satoshi',
    ],
    'ETH': [
        'Ethereum', 'ETH', 'Ether', 'Ξ',
        'ethereum', 'ether', 'eth',
    ],
    'BNB': [
        'Binance Coin', 'BNB',
        'binance coin', 'bnb',
    ],
    'SOL': [
        'Solana', 'SOL',
        'solana', 'sol',
    ],
    'LTC': [
        'Litecoin', 'LTC',
        'litecoin', 'ltc',
    ],
    'LINK': [
        'Chainlink', 'LINK',
        'chainlink', 'link',
    ],
    'MATIC': [
        'Polygon', 'MATIC',
        'polygon', 'matic',
    ],
    'ADA': [
        'Cardano', 'cardano',
        'ADA', 'ada',
    ],
    'GENERAL': [
        'crypto', 'cryptocurrency', 'blockchain', 'DeFi',
        'NFT', 'altcoin', 'stablecoin',
        'hodl', 'fomo', 'fud',
        'bullish', 'bearish',
        'pump', 'dump', 'moon', 'rekt',
    ],
}

# Fetch news, classify its sentiment and save the results to a CSV file
def _iter_date_windows(start_date, end_date):
    """Yield ``(window_start, window_end)`` pairs from start_date up to end_date.

    Each window ends one day after it starts (clamped to ``end_date``) and
    consecutive windows start two days apart.
    """
    # NOTE(review): the window is one day wide but the loop advances two
    # days. Whether the day at window_end is actually covered depends on how
    # the news backend treats its end date (inclusive vs. exclusive) -
    # confirm that no days are skipped.
    window_start = start_date
    while window_start <= end_date:
        window_end = min(window_start + timedelta(days=1), end_date)
        yield window_start, window_end
        window_start += timedelta(days=2)


def fetch_news_and_analyze_sentiment(start_date, end_date, crypto_keywords,
                                     output_path='crypto_news_sentiment.csv'):
    """Query news per keyword and date window, classify it and write a CSV.

    For every date window and every keyword of every token, the news search
    is queried; each returned article's title and description are joined,
    cleaned with ``clean_text`` and passed to the module-level ``classifier``.
    One CSV row is written per article with the columns DATE, TOKEN,
    CLEAN TEXT and LABEL. A progress line is printed for every query.

    Args:
        start_date: First day to query (``datetime``).
        end_date: Last day to query (``datetime``).
        crypto_keywords: Mapping of token symbol to a list of search
            keywords; the symbol is what ends up in the TOKEN column.
        output_path: Destination CSV file. It is overwritten if it exists.
            Defaults to the historical 'crypto_news_sentiment.csv'.
    """
    # Create (or truncate) the CSV file with utf-8 encoding
    with open(output_path, mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['DATE', 'TOKEN', 'CLEAN TEXT', 'LABEL']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        for window_start, window_end in _iter_date_windows(start_date, end_date):
            # The client only depends on the date window, so build it once
            # per window instead of once per keyword.
            google_news = GNews(
                start_date=(window_start.year, window_start.month, window_start.day),
                end_date=(window_end.year, window_end.month, window_end.day),
            )
            span = (f"from {window_start.strftime('%Y-%m-%d')} "
                    f"to {window_end.strftime('%Y-%m-%d')}")

            for token, keywords in crypto_keywords.items():
                for query in keywords:
                    news = google_news.get_news(query)

                    if not news:
                        print(f"No news found for {query} (Token: {token}) {span}")
                        continue

                    for article in news:
                        # `or ''` also covers keys that are present but None,
                        # which would otherwise break the concatenation.
                        title = article.get('title') or ''
                        description = article.get('description') or ''
                        combined_text = title + ' ' + description

                        # Clean the text
                        cleaned_text = clean_text(combined_text)

                        # Analyse the sentiment; the pipeline returns a list,
                        # of which the first result is used
                        sentiment_result = classifier(cleaned_text, truncation=True)[0]

                        # Save the row to the CSV file
                        writer.writerow({
                            'DATE': article.get('published date', ''),
                            'TOKEN': token,
                            'CLEAN TEXT': cleaned_text,
                            'LABEL': translate_label(sentiment_result['label']),
                        })
                    print(f"Processed and saved sentiment for {query} (Token: {token}) {span}")

# Date range to cover: 1 January 2023 through 31 July 2024
start_date = datetime(year=2023, month=1, day=1)
end_date = datetime(year=2024, month=7, day=31)

# Run the full pipeline: fetch news, classify sentiment, write the CSV
fetch_news_and_analyze_sentiment(
    start_date=start_date,
    end_date=end_date,
    crypto_keywords=crypto_keywords,
)
