In [6]:
# What I will do is the following:
# Basic Scoring
# Sentiment label: Positive, Neutral, Negative (1, 0, -1)
# Sentiment score (Ss): A value between -1 and 1
# Model confidence (Sa): a value between 0 and 1 (taken from the model's confidence score)
# The final score is then computed as:
#   final score = Ss * Sa

# I will also use a Weighted Sentiment Calculation for the Tweet-Level Calculations:
# Tr = retweet count
# Ti = like count
# Tc = comment count
# Tf = follower count
# a, b, c, d = weights for the retweet, like, comment, and follower counts (hyperparameters)

# I will also take into consideration the user influence:
# Ui = user influence * E, where E is a hyperparameter (hand-picked; currently 0.8)

import pandas as pd

# Toy data: five rows spread over three categories
data = dict(
    Category=['A', 'A', 'B', 'C', 'B'],
    Value=[10, 20, 30, 40, 50],
)
df = pd.DataFrame(data)

# Per-category mean and row count of 'Value'
by_category = df.groupby('Category')
aggregations = {'Value': ['mean', 'count']}
result = by_category.agg(aggregations)

print(result)

         Value      
          mean count
Category            
A         15.0     2
B         40.0     2
C         40.0     1


In [4]:
# Import necessary libraries
from apify_client import ApifyClient
import pandas as pd
import json
import time
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv
from Data.db import Database

# Initialize database connection
db = Database()

# Load environment variables (for API keys)
load_dotenv()

# Get API Token from environment variable
APIFY_API_TOKEN = os.getenv("APIFY_API_TOKEN")
# Never echo the token itself: notebook outputs are saved with the file and
# would leak the secret to anyone who can read it. Only report its presence.
print(f"APIFY_API_TOKEN loaded: {bool(APIFY_API_TOKEN)}")
# Initialize the ApifyClient with your API token
client = ApifyClient(APIFY_API_TOKEN)

def fetch_tweets(search_terms, start_date, end_date, max_items=10, tweet_language="en",
                min_retweets=0, min_favorites=0, only_verified=False):
    """
    Fetch tweets for each search term by running an Apify actor, one run per term.

    Every returned item is tagged with the term that produced it under the
    'search_term' key. A failure for one term is printed and does not stop
    the remaining terms; items appended before the failure are kept.

    Args:
        search_terms (list): List of search terms to query
        start_date (str): Start date in YYYY-MM-DD format
        end_date (str): End date in YYYY-MM-DD format
        max_items (int): Maximum number of tweets to retrieve per term
        tweet_language (str): Language of tweets to retrieve
        min_retweets (int): Minimum number of retweets
        min_favorites (int): Minimum number of favorites/likes
        only_verified (bool): Only include tweets from verified users

    Returns:
        list: Tweet objects for all terms combined, in retrieval order
    """
    all_tweets = []

    for term in search_terms:
        print(f"Fetching tweets for search term: {term}")

        # Prepare the Actor input
        run_input = {
            "searchTerms": [term],
            "tweetLanguage": tweet_language,
            "minimumRetweets": min_retweets,
            "minimumFavorites": min_favorites,
            "start": start_date,
            "end": end_date,
            "maxItems": max_items,
            "includeSearchTerms": True,
            "sort": "Latest"
        }

        if only_verified:
            run_input["filter"] = "verified"

        try:
            # Run the Actor and wait for it to finish
            run = client.actor("61RPP7dywgiy0JPD0").call(run_input=run_input)
            print(f"Run details: {run}")

            # Count per term: len(all_tweets) is the running total across
            # every term processed so far, which made the message misleading.
            term_count = 0
            for item in client.dataset(run["defaultDatasetId"]).iterate_items():
                # Add the search term to each tweet
                item['search_term'] = term
                all_tweets.append(item)
                term_count += 1

            print(f"Retrieved {term_count} tweets for term: {term}")

            # Respect API rate limits
            time.sleep(2)

        except Exception as e:
            # Best effort: report the failure and move on to the next term.
            print(f"Error fetching tweets for term '{term}': {e}")

    return all_tweets

def process_tweet(tweet, search_term):
    """
    Flatten a raw tweet object into the record layout used for storage.

    Sentiment fields are emitted as empty placeholders ('' / 0.0); no
    sentiment analysis happens here. Missing tweet fields fall back to
    '' / 0 / False, and a missing or null 'author' is treated as an empty
    author instead of raising.

    Args:
        tweet (dict): Tweet object from the API
        search_term (str): The search term that found this tweet

    Returns:
        dict: Processed tweet data; 'collected_at' is the local time of this
        call in ISO format.
    """
    # Current timestamp
    current_time = datetime.now().isoformat()

    # `tweet.get('author', {})` only covers a missing key; a payload with
    # "author": null would hand back None and break the chained .get().
    author = tweet.get('author') or {}

    # Return processed tweet data
    return {
        'tweet_id': tweet.get('id', ''),
        'tweet_text': tweet.get('text', ''),
        'created_at': tweet.get('createdAt', ''),
        'retweet_count': tweet.get('retweetCount', 0),
        'reply_count': tweet.get('replyCount', 0),
        'like_count': tweet.get('likeCount', 0),
        'quote_count': tweet.get('quoteCount', 0),
        'bookmark_count': tweet.get('bookmarkCount', 0),
        'lang': tweet.get('lang', ''),
        'is_reply': tweet.get('isReply', False),
        'is_quote': tweet.get('isQuote', False),
        'is_retweet': tweet.get('isRetweet', False),
        'url': tweet.get('url', ''),
        'search_term': search_term,
        'author_username': author.get('userName', ''),
        'author_name': author.get('name', ''),
        'author_verified': author.get('isVerified', False),
        'author_blue_verified': author.get('isBlueVerified', False),
        'author_followers': author.get('followers', 0),
        'author_following': author.get('following', 0),
        # Sentiment placeholders, to be filled in by a later step
        'sentiment_label': "",
        'sentiment_score': 0.0,
        'sentiment_magnitude': 0.0,
        'weighted_sentiment': 0.0,
        'collected_at': current_time
    }

# Keys of a processed tweet record that are forwarded, by the same name,
# as keyword arguments to db.store_tweets().
_TWEET_COLUMNS = (
    'tweet_id', 'tweet_text', 'created_at',
    'retweet_count', 'reply_count', 'like_count', 'quote_count', 'bookmark_count',
    'lang', 'is_reply', 'is_quote', 'is_retweet', 'url', 'search_term',
    'author_username', 'author_name', 'author_verified', 'author_blue_verified',
    'author_followers', 'author_following',
    'sentiment_label', 'sentiment_score', 'sentiment_magnitude', 'weighted_sentiment',
    'collected_at',
)


def store_tweets_in_db(tweets):
    """
    Store processed tweets in the database, one db.store_tweets() call each.

    A tweet that fails to store (missing key or an error raised by the
    database layer) is reported and skipped; the remaining tweets are still
    attempted.

    Args:
        tweets (list): List of processed tweet dictionaries

    Returns:
        int: Number of tweets stored
    """
    stored_count = 0

    for tweet in tweets:
        try:
            # A missing column raises KeyError here and is handled below.
            db.store_tweets(**{column: tweet[column] for column in _TWEET_COLUMNS})
            stored_count += 1

        except Exception as e:
            # Use .get(): indexing tweet['tweet_id'] here would raise a second
            # KeyError for a record without an id and abort the whole batch.
            print(f"Error storing tweet {tweet.get('tweet_id', '<unknown>')}: {e}")

    return stored_count

def fetch_and_store_tweets_by_period(search_terms, months_back=12, batch_size=10,
                                     tweet_language="en", min_retweets=10,
                                     min_favorites=25, only_verified=True):
    """
    Fetch tweets for multiple search terms in 30-day chunks and store them in the DB.

    "Months" are approximated as consecutive 30-day windows counted back from
    the current time. Adjacent windows share their boundary date string
    (one window's start is the next older window's end).
    NOTE(review): whether that boundary day is fetched twice depends on how
    the scraper treats start/end inclusivity - confirm.

    Args:
        search_terms (list): List of search terms
        months_back (int): Number of 30-day windows to look back
        batch_size (int): Maximum number of tweets requested per term per window
        tweet_language (str): Language filter passed to fetch_tweets
        min_retweets (int): Minimum retweet count passed to fetch_tweets
        min_favorites (int): Minimum like count passed to fetch_tweets
        only_verified (bool): Restrict to verified authors (passed to fetch_tweets)

    Returns:
        int: Total number of tweets stored
    """
    total_stored = 0

    # All windows are measured back from this single reference point
    end_date = datetime.now()

    # Process each 30-day window, newest first
    for month in range(months_back):
        month_end = end_date - timedelta(days=30 * month)
        month_start = end_date - timedelta(days=30 * (month + 1))

        end_str = month_end.strftime('%Y-%m-%d')
        start_str = month_start.strftime('%Y-%m-%d')

        print(f"\n======= Fetching month {month+1} of {months_back} ({start_str} to {end_str}) =======")

        # Process each search term
        for term in search_terms:
            print(f"Fetching tweets for term: {term}")

            # Fetch tweets for this term and date range
            term_tweets = fetch_tweets(
                search_terms=[term],
                start_date=start_str,
                end_date=end_str,
                max_items=batch_size,
                tweet_language=tweet_language,
                min_retweets=min_retweets,
                min_favorites=min_favorites,
                only_verified=only_verified
            )

            if term_tweets:
                # Process tweets
                processed_tweets = [process_tweet(tweet, term) for tweet in term_tweets]

                # Store in database
                stored_count = store_tweets_in_db(processed_tweets)
                total_stored += stored_count

                print(f"Stored {stored_count} tweets for term '{term}' for period {start_str} to {end_str}")
            else:
                print(f"No tweets found for term '{term}' for period {start_str} to {end_str}")

            # Be nice to the API
            time.sleep(5)

    print(f"### Tweet fetching and storage completed. Total tweets stored: {total_stored} ###")
    return total_stored

# Define search terms for Apple.
# Each entry is one query string; the collection code fetches them one at a time.
# NOTE(review): OR, parentheses and -from: look like Twitter advanced-search
# operators - confirm the scraper interprets them that way.
# NOTE(review): "iPadOS" appears in both the iPad and the iOS queries, so the
# same tweet may be returned for both terms.
APPLE_SEARCH_TERMS = [
    "$AAPL OR Apple -from:Apple", 
    "Tim Cook", 
    "iPhone", 
    "WWDC OR (Apple event)",
    "MacBook OR Macbook OR (Mac Pro) OR iMac",
    "iPad OR iPadOS",
    "iOS OR iPadOS OR macOS",
    "Apple earnings OR (AAPL earnings)",
    "(Apple stock) OR (AAPL stock) OR (Apple shares)",
    "Apple AI OR (Apple intelligence)",
    "Apple Vision Pro",
    "Apple Watch",
    "AirPods OR (Apple headphones)"
]

# Execute the data collection - adjust parameters as needed.
# This triggers real API calls and database writes when the cell runs.
total_tweets = fetch_and_store_tweets_by_period(
    search_terms=APPLE_SEARCH_TERMS,
    months_back=1,  # Look back one 30-day window (use 12 for roughly a year)
    batch_size=5   # At most 5 tweets requested per term per window
)

# The returned value is the number of tweets stored in the database
print(f"Data collection complete. Total tweets collected: {total_tweets}")

Connected to database: ../Data/data.db
Using database at: C:\Users\kemoo\PycharmProjects\Stock_AI_Predictor\Data\data.db
APIFY_API_TOKEN: [REDACTED]

Fetching tweets for term: $AAPL OR Apple -from:Apple
Fetching tweets for search term: $AAPL OR Apple -from:Apple
Error fetching tweets for term '$AAPL OR Apple -from:Apple': Monthly usage hard limit exceeded
No tweets found for term '$AAPL OR Apple -from:Apple' for period 2025-03-15 to 2025-04-14


KeyboardInterrupt: 