In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bitcoin-tweets-16m-tweets-with-sentiment-tagged/mbsa.csv


In [2]:
# 1. INSTALLATION AND IMPORTS

# tqdm is useful for progress bars, which is essential for a large dataset
!pip install tqdm -q

import pandas as pd
import numpy as np
import re
import logging
from tqdm.auto import tqdm

# For splitting the data
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# For VADER sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Download necessary NLTK resources for VADER
nltk.download('vader_lexicon', quiet=True)

# Initialize tqdm for pandas to see progress on .apply() operations
tqdm.pandas()

logging.info("Libraries imported and ready.")

2025-11-28 18:58:29 - INFO - Libraries imported and ready.


In [3]:
# 2. LOAD AND EXPLORE DATA
import logging
import pandas as pd

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

logging.info("--- Step 2: Loading and Exploring Data ---")

# The path to the file in the Kaggle environment
file_path = '/kaggle/input/bitcoin-tweets-16m-tweets-with-sentiment-tagged/mbsa.csv'

# For development, it is HIGHLY recommended to load only a sample.
# Once the code is confirmed to work, remove 'nrows' to load all 16M tweets.
df = pd.read_csv(file_path, nrows=500000) # Tip: start with 500k rows

logging.info(f"Number of tweets loaded: {len(df)}")
logging.info(f"Available columns: {df.columns.tolist()}")

logging.info("First 5 rows preview:")
# We convert the dataframe head to string to log it properly
logging.info("\n" + str(df.head()))

logging.info("Sentiment distribution:")
logging.info("\n" + str(df['Sentiment'].value_counts(normalize=True)))

2025-11-28 18:59:17 - INFO - --- Step 2: Loading and Exploring Data ---
2025-11-28 18:59:20 - INFO - Number of tweets loaded: 500000
2025-11-28 18:59:20 - INFO - Available columns: ['Date', 'text', 'Sentiment']
2025-11-28 18:59:20 - INFO - First 5 rows preview:
2025-11-28 18:59:20 - INFO - 
         Date                                               text Sentiment
0  2019-05-27  È appena uscito un nuovo video! LES CRYPTOMONN...  Positive
1  2019-05-27  Cardano: Digitize Currencies; EOS https://t.co...  Positive
2  2019-05-27  Another Test tweet that wasn't caught in the s...  Positive
3  2019-05-27  Current Crypto Prices! \n\nBTC: $8721.99 USD\n...  Positive
4  2019-05-27  Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...  Positive
2025-11-28 18:59:20 - INFO - Sentiment distribution:
2025-11-28 18:59:20 - INFO - 
Sentiment
Positive    0.675518
Negative    0.324482
Name: proportion, dtype: float64


In [4]:
# 3. CLEAN AND PREPARE DATA
import logging
import re

logging.info("--- Step 3: Cleaning and Preparing Data ---")

def clean_tweet(text):
    """
    Function to clean the text of a tweet.
    - Removes URLs
    - Removes mentions (@username)
    - Removes hashtags (#hashtag)
    - Removes non-alphanumeric characters (except spaces)
    - Converts to lowercase
    - Removes extra whitespace
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # Remove URLs
    text = re.sub(r'@\w+', '', text)                 # Remove mentions
    text = re.sub(r'#\w+', '', text)                 # Remove hashtags
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)       # Remove special characters
    text = text.lower()                              # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()         # Remove extra whitespace
    return text

# Apply the cleaning function with a progress bar
# Ensure tqdm.pandas() was run in Step 1
df['cleaned_text'] = df['text'].progress_apply(clean_tweet)

# Drop tweets that are empty after cleaning
df.dropna(subset=['cleaned_text'], inplace=True)
df = df[df['cleaned_text'] != '']

# Map sentiments to numerical values for models
# Positive -> 1, Negative -> 0. We ignore others for this binary case.
df = df[df['Sentiment'].isin(['Positive', 'Negative'])]
df['label'] = df['Sentiment'].map({'Positive': 1, 'Negative': 0})

logging.info("Preview after cleaning and preparation:")
logging.info("\n" + str(df[['cleaned_text', 'Sentiment', 'label']].head()))

2025-11-28 19:00:11 - INFO - --- Step 3: Cleaning and Preparing Data ---


  0%|          | 0/500000 [00:00<?, ?it/s]

2025-11-28 19:00:19 - INFO - Preview after cleaning and preparation:
2025-11-28 19:00:19 - INFO - 
                                        cleaned_text Sentiment  label
0  appena uscito un nuovo video les cryptomonnaie...  Positive      1
1  cardano digitize currencies eos 6500 roi atamp...  Positive      1
2  another test tweet that wasnt caught in the st...  Positive      1
3  current crypto prices btc 872199 usd eth 26662...  Positive      1
4  spiv nosar baz bitcoin is an asset amp not a c...  Positive      1


In [5]:
# 4. CREATE DATA SPLITS (TRAIN/VALIDATION/TEST)
import logging
from sklearn.model_selection import train_test_split

logging.info("--- Step 4: Creating Data Splits ---")

X = df['cleaned_text']
y = df['label']

# Step A: Split into Train (70%) and a temporary set (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step B: Split the temporary set into Validation (15%) and Test (15%)
# (0.5 * 30% = 15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

logging.info(f"Training set size   : {len(X_train)} ({len(X_train)/len(df):.0%})")
logging.info(f"Validation set size : {len(X_val)} ({len(X_val)/len(df):.0%})")
logging.info(f"Test set size       : {len(X_test)} ({len(X_test)/len(df):.0%})")

2025-11-28 19:01:16 - INFO - --- Step 4: Creating Data Splits ---
2025-11-28 19:01:17 - INFO - Training set size   : 323470 (70%)
2025-11-28 19:01:17 - INFO - Validation set size : 69315 (15%)
2025-11-28 19:01:17 - INFO - Test set size       : 69316 (15%)


In [6]:
import pandas as pd
import numpy as np
import re
import os
import logging
from tqdm import tqdm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Configuration
csv_file = '/kaggle/input/bitcoin-tweets-16m-tweets-with-sentiment-tagged/mbsa.csv'
chunk_size = 100000 
probs = [0.70, 0.15, 0.15] # 70% Train, 15% Val, 15% Test

# Output files
train_file = 'train_full.csv'
val_file = 'val_full.csv'
test_file = 'test_full.csv'

# Delete existing files to start fresh
for f in [train_file, val_file, test_file]:
    if os.path.exists(f):
        os.remove(f)
        logging.info(f"Removed existing file: {f}")

logging.info("Generating files (1M sample for speed)...")

# Text cleaning function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)    # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)    # Remove punctuation
    return text.strip()

# Label mapping
label_map = {'negative': 0, 'Negative': 0, 'neutral': 1, 'Neutral': 1, 'positive': 2, 'Positive': 2}

# Data reading (Limited to 1M rows for this run)
reader = pd.read_csv(csv_file, chunksize=chunk_size, nrows=1000000,
                     on_bad_lines='skip', engine='python',
                     usecols=lambda c: c.lower() in ['text', 'sentiment'])

for i, chunk in enumerate(tqdm(reader, desc="Processing")):
    chunk = chunk.copy()
    
    # Standardization
    chunk.columns = [c.lower() for c in chunk.columns]
    chunk.dropna(inplace=True)
    
    # Cleaning
    chunk['cleaned_text'] = chunk['text'].apply(clean_text)
    chunk = chunk[chunk['cleaned_text'] != ""]
    
    # Label Encoding
    chunk['label'] = chunk['sentiment'].map(label_map)
    chunk = chunk.dropna(subset=['label'])
    chunk['label'] = chunk['label'].astype(int)
    
    # Train/Val/Test Split
    split = np.random.choice(['train', 'val', 'test'], size=len(chunk), p=probs)
    
    # Saving to CSV
    mode = 'a' if i > 0 else 'w'
    header = (i == 0)
    
    chunk[split == 'train'][['cleaned_text', 'label']].to_csv(train_file, mode=mode, header=header, index=False)
    chunk[split == 'val'][['cleaned_text', 'label']].to_csv(val_file, mode=mode, header=header, index=False)
    chunk[split == 'test'][['cleaned_text', 'label']].to_csv(test_file, mode=mode, header=header, index=False)

logging.info("Files regenerated! You can restart VADER training.")

2025-11-28 19:01:41 - INFO - Generating files (1M sample for speed)...
Processing: 10it [00:16,  1.68s/it]
2025-11-28 19:01:57 - INFO - Files regenerated! You can restart VADER training.


In [7]:
import pandas as pd
import json
import os
import logging
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Config
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

logging.info("Starting vocabulary learning...")

# 1. Load training data (1 Million rows is sufficient for learning)
if os.path.exists('train_full.csv'):
    # Read only useful columns
    df_train = pd.read_csv('train_full.csv', nrows=1000000)
    
    # 2. Clean labels (Safety: ensure we have 0 and 1)
    def clean_label(x):
        # If it's 2, 'Positive', or 'positive' -> convert to 1
        if str(x) in ['2', 'Positive', 'positive', '1']: return 1
        return 0 # Otherwise 0
    
    df_train['label_bin'] = df_train['label'].apply(clean_label)
    
    logging.info("Analyzing word frequencies (approx 30 sec)...")
    
    # 3. Separate texts
    # Convert everything to string to avoid bugs
    pos_text = ' '.join(df_train[df_train['label_bin'] == 1]['cleaned_text'].astype(str))
    neg_text = ' '.join(df_train[df_train['label_bin'] == 0]['cleaned_text'].astype(str))
    
    # 4. Counting
    pos_counts = Counter(pos_text.split())
    neg_counts = Counter(neg_text.split())
    
    # 5. Calculate Score for each word
    new_lexicon = {}
    all_words = set(pos_counts.keys()).union(set(neg_counts.keys()))
    
    logging.info(f"Calculating scores for {len(all_words)} unique words...")
    
    for word in all_words:
        # Ignore short words or stop words
        if len(word) < 3 or word in stop_words: continue
        
        p = pos_counts[word]
        n = neg_counts[word]
        total = p + n
        
        # Word must appear at least 50 times to be reliable
        if total < 50: continue
        
        ratio = p / total
        
        score = 0
        # If the word appears > 75% in positive tweets -> Positive Score
        if ratio > 0.75: 
            score = 2.0 + (ratio * 2.0) # Max 4.0
        # If the word appears < 25% (mostly negative) -> Negative Score
        elif ratio < 0.25:
            score = -2.0 - ((1-ratio) * 2.0) # Max -4.0
            
        if score != 0:
            new_lexicon[word] = round(score, 2)

    # 6. Save
    with open('crypto_lexicon_final.json', 'w') as f:
        json.dump(new_lexicon, f)
        
    logging.info(f"SUCCESS: File 'crypto_lexicon_final.json' created with {len(new_lexicon)} words.")
    logging.info("You can now run Step 2!")

else:
    logging.error("ERROR: Could not find 'train_full.csv'. Did you run the Split step?")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-11-28 19:02:34 - INFO - Starting vocabulary learning...
2025-11-28 19:02:36 - INFO - Analyzing word frequencies (approx 30 sec)...
2025-11-28 19:02:40 - INFO - Calculating scores for 562587 unique words...
2025-11-28 19:02:40 - INFO - SUCCESS: File 'crypto_lexicon_final.json' created with 4557 words.
2025-11-28 19:02:40 - INFO - You can now run Step 2!


In [8]:
import pandas as pd
import json
import nltk
import logging
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Config
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

logging.info("--- FINAL VADER EVALUATION (HYBRID) ---")

# 1. Load Automatic Lexicon (the one created in the previous step)
try:
    with open('crypto_lexicon_final.json', 'r') as f:
        auto_lexicon = json.load(f)
    logging.info(f"Automatic Lexicon loaded: {len(auto_lexicon)} words.")
except FileNotFoundError:
    logging.warning("ERROR: JSON file not found. Please rerun Step 1!")
    auto_lexicon = {}

# 2. Manual Lexicon (Priority)
manual_lexicon = {
    'hodl': 3.5, 'moon': 4.0, 'bullish': 3.5, 'ath': 3.5, 'pump': 2.5,
    'bearish': -3.5, 'fud': -3.0, 'dump': -3.5, 'scam': -4.0, 'rekt': -4.0,
    'btc': 1.0, 'bitcoin': 1.0, 'buy': 2.5, 'long': 2.5, 'short': -2.5, 'sell': -2.5
}

# 3. Merge and Update VADER
sid.lexicon.update(auto_lexicon)    # AI/Statistical lexicon first
sid.lexicon.update(manual_lexicon)  # Manual lexicon second (overwrites duplicates)
logging.info("VADER updated and ready.")

# 4. Prediction on Test Set
logging.info("Loading Test Set...")
test_df = pd.read_csv('test_full.csv') 

# Fix labels (Ensure binary 0/1)
test_df['label_bin'] = test_df['label'].apply(lambda x: 1 if str(x) in ['2', 'Positive', 'positive', '1'] else 0)

# Prediction function
predictions = []
threshold = 0.0 # Strict threshold for binary classification

logging.info("Predictions in progress...")
for text in tqdm(test_df['cleaned_text'].astype(str)):
    try:
        score = sid.polarity_scores(text)['compound']
        predictions.append(1 if score > threshold else 0)
    except:
        predictions.append(0)

# 5. Results
acc = accuracy_score(test_df['label_bin'], predictions)
logging.info("="*40)
logging.info(f"FINAL VADER SCORE: {acc:.2%}")
logging.info("="*40)

logging.info("Confusion Matrix:")
# Logging the matrix as a string to preserve structure
logging.info("\n" + str(confusion_matrix(test_df['label_bin'], predictions)))

logging.info("Detailed Report:")
logging.info("\n" + classification_report(test_df['label_bin'], predictions, target_names=['Negative', 'Positive']))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
2025-11-28 19:02:59 - INFO - --- FINAL VADER EVALUATION (HYBRID) ---
2025-11-28 19:02:59 - INFO - Automatic Lexicon loaded: 4557 words.
2025-11-28 19:02:59 - INFO - VADER updated and ready.
2025-11-28 19:02:59 - INFO - Loading Test Set...
2025-11-28 19:03:00 - INFO - Predictions in progress...
100%|██████████| 148664/148664 [00:26<00:00, 5513.70it/s]
2025-11-28 19:03:27 - INFO - FINAL VADER SCORE: 69.80%
2025-11-28 19:03:27 - INFO - Confusion Matrix:
2025-11-28 19:03:27 - INFO - 
[[ 8058 33989]
 [10904 95713]]
2025-11-28 19:03:27 - INFO - Detailed Report:
2025-11-28 19:03:27 - INFO - 
              precision    recall  f1-score   support

    Negative       0.42      0.19      0.26     42047
    Positive       0.74      0.90      0.81    106617

    accuracy                           0.70    148664
   macro avg       0.58      0.54      0.5