In [5]:
pip install vaderSentiment


  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import zipfile

# Directory paths for tweets and prices
train_tweet_dir = '/kaggle/input/multimodalai-prediction/public_dataset/train/tweets'
price_data_dir = '/kaggle/input/multimodalai-prediction/public_dataset/train/price_data'
output_price_data_dir = '/kaggle/working/price_data'  # Create a directory in the working space

# Create the output directory if it doesn't exist
os.makedirs(output_price_data_dir, exist_ok=True)

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to clean the tweet
def clean_tweet(tweet):
    tweet = re.sub(r"http\S+", "", tweet)  # Remove URLs
    tweet = re.sub(r"@\w+", "", tweet)     # Remove mentions
    tweet = re.sub(r"#\w+", "", tweet)     # Remove hashtags
    tweet = re.sub(r"[^\w\s]", "", tweet)  # Remove punctuation
    tweet = tweet.lower()                  # Convert to lowercase
    return tweet

# Function to get sentiment score using VADER
def get_sentiment_vader(tweet):
    sentiment = analyzer.polarity_scores(tweet)
    return sentiment['compound']

# Process tweets and price data
def process_coin_data(coin_name):
    # Load tweet data for the coin
    coin_tweet_folder = os.path.join(train_tweet_dir, coin_name)
    tweet_files = os.listdir(coin_tweet_folder)

    all_tweets = []
    for tweet_file in tweet_files:
        file_path = os.path.join(coin_tweet_folder, tweet_file)
        
        try:
            # Extract the date from the filename (assuming the filename contains the date)
            tweet_date = pd.to_datetime(tweet_file.replace('.csv', ''), format='%Y-%m-%d').date()

            # Load the tweets for that day
            tweet_data = pd.read_csv(file_path, engine='python', on_bad_lines='skip')
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue

        if 'tweet' in tweet_data.columns:
            # Clean the tweet text and get sentiment score
            tweet_data['cleaned_tweet'] = tweet_data['tweet'].apply(lambda x: clean_tweet(x) if pd.notnull(x) else '')
            tweet_data['sentiment_score'] = tweet_data['cleaned_tweet'].apply(lambda x: get_sentiment_vader(x) if x != '' else 0)
            
            # Add the date from the filename to the tweet data
            tweet_data['date'] = tweet_date  
            all_tweets.append(tweet_data[['date', 'sentiment_score']])

    if all_tweets:
        # Combine all tweets into a single DataFrame
        all_tweets_df = pd.concat(all_tweets)
        
        # Group by date and calculate the average sentiment score for each day
        daily_sentiment = all_tweets_df.groupby('date')['sentiment_score'].mean().reset_index()

        # Debugging: print daily sentiment data
        print(f"\nSentiment data for {coin_name}:")
        print(daily_sentiment.head(), len(daily_sentiment))  # Show the number of rows
    else:
        print(f"No sentiment data available for {coin_name}")
        daily_sentiment = pd.DataFrame(columns=['date', 'sentiment_score'])

    # Process price data
    price_file_path = os.path.join(price_data_dir, f"{coin_name}.csv")
    try:
        price_data = pd.read_csv(price_file_path)

        # Ensure datetime column is in proper format
        price_data['datetime'] = pd.to_datetime(price_data['datetime'])
        price_data['date'] = price_data['datetime'].dt.date  # Extract date from datetime

        # Debugging: print price data before merging
        print(f"\nPrice data for {coin_name} before merging:")
        print(price_data[['datetime', 'date']].head())

        # Merge sentiment with price data on date
        price_data = pd.merge(price_data, daily_sentiment, on='date', how='left')

        # Debugging: print price data after merging
        print(f"\nPrice data for {coin_name} after merging:")
        print(price_data[['datetime', 'date', 'sentiment_score']].head())

        # Handle NaN values in sentiment_score by filling with 0
        price_data['sentiment_score'] = price_data['sentiment_score'].fillna(0)

        # Optionally, drop the 'date' column if not needed anymore
        price_data.drop(columns='date', inplace=True)

        # Save the updated price data with sentiment scores to the output directory
        output_file_path = os.path.join(output_price_data_dir, f"{coin_name}.csv")
        price_data.to_csv(output_file_path, index=False)

    except Exception as e:
        print(f"Error processing price data for {coin_name}: {e}")

# Process and update sentiment for all coins
for coin_name in os.listdir(train_tweet_dir):
    print(f"Processing data for {coin_name}")
    process_coin_data(coin_name)

print("Sentiment scores added to price data.")

# Create a zip file of all updated CSVs
zip_file_path = '/kaggle/working/price_data.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
    for root, dirs, files in os.walk(output_price_data_dir):
        for file in files:
            if file.endswith('.csv'):
                zipf.write(os.path.join(root, file), file)  # Write only the file name to zip

print(f"All updated price data files have been zipped into {zip_file_path}.")


Processing data for DIA

Sentiment data for DIA:
         date  sentiment_score
0  2020-08-18         0.187630
1  2020-08-19         0.181689
2  2020-08-20         0.149146
3  2020-08-21         0.163856
4  2020-08-22         0.178643 163

Price data for DIA before merging:
    datetime        date
0 2020-08-18  2020-08-18
1 2020-08-19  2020-08-19
2 2020-08-20  2020-08-20
3 2020-08-21  2020-08-21
4 2020-08-22  2020-08-22

Price data for DIA after merging:
    datetime        date  sentiment_score
0 2020-08-18  2020-08-18         0.187630
1 2020-08-19  2020-08-19         0.181689
2 2020-08-20  2020-08-20         0.149146
3 2020-08-21  2020-08-21         0.163856
4 2020-08-22  2020-08-22         0.178643
Processing data for XSR

Sentiment data for XSR:
         date  sentiment_score
0  2020-08-06         0.384975
1  2020-08-07         0.204829
2  2020-08-08         0.050150
3  2020-08-09         0.123733
4  2020-08-10         0.000000 44

Price data for XSR before merging:
    datetime  

  price_data['sentiment_score'] = price_data['sentiment_score'].fillna(0)



Sentiment data for ZEN:
         date  sentiment_score
0  2019-02-19         0.179544
1  2019-02-20         0.206643
2  2019-02-21         0.143484
3  2019-02-22         0.289035
4  2019-02-23         0.208258 373

Price data for ZEN before merging:
    datetime        date
0 2018-02-12  2018-02-12
1 2018-02-13  2018-02-13
2 2018-02-14  2018-02-14
3 2018-02-15  2018-02-15
4 2018-02-16  2018-02-16

Price data for ZEN after merging:
    datetime        date  sentiment_score
0 2018-02-12  2018-02-12              NaN
1 2018-02-13  2018-02-13              NaN
2 2018-02-14  2018-02-14              NaN
3 2018-02-15  2018-02-15              NaN
4 2018-02-16  2018-02-16              NaN
Processing data for DOTUP

Sentiment data for DOTUP:
         date  sentiment_score
0  2020-09-11         0.291337
1  2020-09-12         0.317445
2  2020-09-13         0.292403
3  2020-09-14         0.333211
4  2020-09-15         0.318200 94

Price data for DOTUP before merging:
    datetime        date
0 2020-