<a href="https://colab.research.google.com/github/vvvvvvss/StockMarketManupilationSystem/blob/main/Stock_Manupilation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Real-time data processing and analysis

In [None]:
pip install aiohttp pandas confluent-kafka



Trying to see what kind of data can be fetched from the API: Alphavantage

In [None]:
import requests
import pandas as pd
import time

API_KEY = "QT13WY791JO16QMJ"
BASE_URL = "https://www.alphavantage.co/query"

def fetch_stock_data(symbol, interval="5min"):
    params = {
        "function": "TIME_SERIES_INTRADAY",
        "symbol": symbol,
        "interval": interval,
        "apikey": API_KEY,
        "outputsize": "compact"
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()

    if "Error Message" in data:
        print(f"Error fetching data for {symbol}: {data['Error Message']}")
        return None
    elif f"Time Series ({interval})" in data:
        time_series = data[f"Time Series ({interval})"]
        df = pd.DataFrame.from_dict(time_series, orient="index")
        df.reset_index(inplace=True)
        df.rename(columns={"index": "timestamp"}, inplace=True)
        return df
    else:
        print(f"Unexpected data format for {symbol}: {data}")
        return None

stock_data = fetch_stock_data("AAPL")
if stock_data is not None:
    print(stock_data.head())
else:
    print("Could not retrieve stock data.")

             timestamp   1. open   2. high    3. low  4. close 5. volume
0  2025-03-25 19:55:00  224.1300  224.3300  224.1000  224.2400      3316
1  2025-03-25 19:50:00  224.2000  224.3300  224.0700  224.1000       738
2  2025-03-25 19:45:00  224.1500  224.3300  224.0700  224.2800      2743
3  2025-03-25 19:40:00  224.1500  224.1500  224.0700  224.0700       834
4  2025-03-25 19:35:00  224.1000  224.1500  224.0500  224.0701       546


#    Data Collection - Rough

1.   Fetch trading data from Alpha Vantage
2.   Detect potential market manipulation using Isolation Forest
3.   Mock implementation of social media sentiment collection



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import requests

class MarketManipulationDetector:
    def __init__(self, alpha_vantage_key):
        self.alpha_vantage_key = alpha_vantage_key
        self.trading_data = None
        self.sentiment_data = None

    def fetch_trading_data(self, symbol):
        url = f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={symbol}&apikey={self.alpha_vantage_key}"
        response = requests.get(url)

        if response.status_code == 200:
            raw_data = response.json()
            self.trading_data = pd.DataFrame.from_dict(
                raw_data.get('Time Series (Daily)', {}),
                orient='index'
            )
            self.trading_data.columns = [
                'open', 'high', 'low', 'close', 'volume'
            ]
            self.trading_data = self.trading_data.astype(float)

    def detect_anomalous_trading(self):
        if self.trading_data is None:
            raise ValueError("Trading data not loaded")


        features = ['volume', 'close']
        X = self.trading_data[features]

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        clf = IsolationForest(contamination=0.1, random_state=42)
        y_pred = clf.fit_predict(X_scaled)
        self.trading_data['is_anomaly'] = y_pred == -1

        return self.trading_data[self.trading_data['is_anomaly']]

    def collect_social_sentiment(self, symbol):
        #  without StockTwits API

        fake_sentiments = {
            'bullish': 0.6,
            'bearish': 0.3,
            'neutral': 0.1
        }
        return fake_sentiments

def main():

    detector = MarketManipulationDetector(alpha_vantage_key='QT13WY791JO16QMJ')
    detector.fetch_trading_data('INFY')

    anomalies = detector.detect_anomalous_trading()
    print("Potential Manipulative Trading Days:")
    print(anomalies)


    sentiment = detector.collect_social_sentiment('INFY')
    print("\nSocial Media Sentiment:")
    print(sentiment)

if __name__ == "__main__":
    main()

Potential Manipulative Trading Days:
             open    high      low  close      volume  is_anomaly
2025-03-28  18.37  18.482  18.1050  18.17   7866062.0        True
2025-03-27  18.70  18.780  18.5950  18.67   6249534.0        True
2025-03-21  18.41  18.430  18.1700  18.32  18677618.0        True
2025-03-20  18.33  18.390  17.9001  18.06  19376214.0        True
2025-03-13  18.50  18.585  18.2600  18.29  10913566.0        True
2025-03-12  18.49  18.645  18.3400  18.50  15292391.0        True
2025-03-11  19.13  19.200  18.8100  18.97  17695135.0        True
2025-01-16  22.60  22.600  21.3100  21.57  22922717.0        True
2024-12-19  23.18  23.620  23.1000  23.42   9178696.0        True
2024-12-13  23.52  23.630  23.2800  23.40   4443501.0        True

Social Media Sentiment:
{'bullish': 0.6, 'bearish': 0.3, 'neutral': 0.1}


In [None]:
!pip install alpha_vantage

Collecting alpha_vantage
  Downloading alpha_vantage-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading alpha_vantage-3.0.0-py3-none-any.whl (35 kB)
Installing collected packages: alpha_vantage
Successfully installed alpha_vantage-3.0.0


# Main code

In [None]:
import requests
import pandas as pd
import json
from textblob import TextBlob
from alpha_vantage.timeseries import TimeSeries


#stock data collection
ALPHA_VANTAGE_API_KEY = "ED3T9IQN5OD495QC"
STOCK_SYMBOL = "AAPL"

ts = TimeSeries(key=ALPHA_VANTAGE_API_KEY, output_format='pandas')
data, meta_data = ts.get_daily(symbol=STOCK_SYMBOL, outputsize='compact')


data.to_csv("stock_data.csv") # storing stock data as a CSV file
print("Stock data saved successfully.")

# StockTwits Data
def fetch_stocktwits_data(symbol):
    url = f"https://api.stocktwits.com/api/2/streams/symbol/{symbol}.json"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

def analyze_sentiment(messages):
    sentiments = []
    for msg in messages:
        text = msg['body']
        sentiment = TextBlob(text).sentiment.polarity
        sentiments.append({'timestamp': msg['created_at'], 'text': text, 'sentiment_score': sentiment})
    return sentiments

stocktwits_data = fetch_stocktwits_data("TCS")
if stocktwits_data:
    messages = stocktwits_data['messages']
    sentiment_analysis = analyze_sentiment(messages)
    df_sentiment = pd.DataFrame(sentiment_analysis)
    df_sentiment.to_csv("sentiment_data.csv", index=False)
    print("Sentiment data saved successfully.")
else:
    print("Failed to fetch StockTwits data.")


def analyze_news_sentiment(news_text):
    return TextBlob(news_text).sentiment.polarity

news_text_sample = "Stock markets rally as tech stocks soar."
print("Sample News Sentiment Score:", analyze_news_sentiment(news_text_sample))


Stock data saved successfully.
Failed to fetch StockTwits data.
Sample News Sentiment Score: 0.0


In [None]:
!pip install feedparser

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=208730de486f11e99ae97bbc567df4a5a2853405ec2ad5fb060084893a8550ad
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


# Data Collection
1. Fetch daily stock data using Alpha Vantage API
2. Analyze sentiment of messages
3. Analyze sentiment of news text



In [None]:
import requests
import pandas as pd
import json
from textblob import TextBlob
from alpha_vantage.timeseries import TimeSeries
import feedparser

ALPHA_VANTAGE_API_KEY = "ED3T9IQN5OD495QC"
STOCK_SYMBOL = "AAPL"
STOCKTWITS_API_URL = "https://api.stocktwits.com/api/2/streams/symbol/{symbol}.json"

def fetch_stock_data(symbol, api_key):
    try:
        ts = TimeSeries(key=api_key, output_format='pandas')
        data, meta_data = ts.get_daily(symbol=symbol, outputsize='compact')
        data.to_csv("stock_data.csv")
        print(f"\nStock data for {symbol} saved successfully.")
        return data
    except Exception as e:
        print(f"Error fetching stock data: {e}")
        return None

def fetch_stocktwits_data(symbol):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        url = STOCKTWITS_API_URL.format(symbol=symbol)
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            data = response.json()
            print(f"Successfully fetched StockTwits data for {symbol}")
            return data
        else:
            print(f"Failed to fetch StockTwits data. Status code: {response.status_code}")
            print(f"Response content: {response.text}")
            return None
    except Exception as e:
        print(f"Error in fetching StockTwits data: {e}")
        return None

def analyze_sentiment(messages):
    sentiments = []
    for msg in messages:
        text = msg.get('body', '')
        sentiment = TextBlob(text).sentiment.polarity
        sentiments.append({
            'timestamp': msg.get('created_at', 'N/A'),
            'text': text,
            'sentiment_score': sentiment
        })
    return sentiments

def get_google_news_rss(stock_name):
    url = f"https://news.google.com/rss/search?q={stock_name}+stock"
    feed = feedparser.parse(url)

    news_list = []
    for entry in feed.entries[:5]:  # Fetch top 5 news articles
        news_list.append({"title": entry.title, "link": entry.link})

    return news_list

news_data = get_google_news_rss("TCS")
for news in news_data:
    print("\n",news["title"], "-", news["link"])

def analyze_news_sentiment(news_data):
    return TextBlob(news_data).sentiment.polarity

def main():
    stock_data = fetch_stock_data(STOCK_SYMBOL, ALPHA_VANTAGE_API_KEY)
    stocktwits_data = fetch_stocktwits_data(STOCK_SYMBOL)

    if stocktwits_data and 'messages' in stocktwits_data:
        sentiment_analysis = analyze_sentiment(stocktwits_data['messages'])
        df_sentiment = pd.DataFrame(sentiment_analysis)
        df_sentiment.to_csv("sentiment_data.csv", index=False)
        print("Sentiment data saved successfully.")
    else:
        print("No messages found in StockTwits data.")
    news_text_sample = "Stock markets rally as tech stocks soar."
    print("Sample News Sentiment Score:", analyze_news_sentiment(news_text_sample))

if __name__ == "__main__":
    main()


 TCS, Infosys, HDFC Bank, HCL Tech among 5 key stocks to declare interim dividend in April 2025 - Mint - https://news.google.com/rss/articles/CBMi8wFBVV95cUxQOXJ0TEd1UWVmTXRSS0UxYThjNnJjaS1SWWJReGdLS1RYMVAwR2pwaE00OExTQzhKeFdUckVYTlh3dFNZQ19vQkV2TG5zdVRqd19ZVXVaRlF4TFk4MGZBR2EtV3YzZXI5RkNzU01icHZCbVo0YTdhemstVU1EN2wwaUdUYnFINk5DRTlsbk95UTJfeFQ0WUYyVkxFRFZKaG5Cc212QVFycjJxVFZVRkZuN09oQjBPRF9fTmc5RzlrZG1tMW1HMjJuUHpHWWJSU3RtT2NsOWpPbkJtQ3hxMUJWRTA5c0lnLTE3WXhGS21tXzQ3OUXSAfgBQVVfeXFMTXd6aFRBUEROLW95SDdmeVU0dldqREJxdGpIOUJYV2x4XzFYS21zNlNZa3JmR1o5ZXpYVEQxSVB3NVVBblRWb0haenFVSGJqdHFONzllUlFXcVhINlplN2Vicm5kQS1oZlZIbFo5ZHltNVVscDBIdXl0Nm5EY1Zud3UtRmZMQmF4SHZlaWo4NGdCVEFDU1RyNlUtcDE2cGxTdGdfR01TdGFrTG8wVU93TlhLTktFNXVHZE50ejN0VUNTNnNxRzh4RmlwejJQNkdoTmNFUWtYZTYtcmR3cGdhazdibnJ6a0pmS3pyR0s1TGU3WHhiZTE3Tjg?oc=5

 32% target price slash! Goldman flags big risks for TCS, Infosys & other IT stocks amid US worries - The Economic Times - https://news.google.com/rss/articles/CBMi3AFBVV95cUxPdVdENUl

# April Monthly Progress

In [None]:
!pip install pandas numpy requests matplotlib seaborn scikit-learn xgboost nltk tweepy newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collec

In [1]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collec

In [2]:
!pip install lxml_html_clean

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.2-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.2


In [6]:
!pip install --upgrade nltk



In [None]:
import nltk
nltk.download('vader_lexicon')  # Download the VADER lexicon for sentiment analysis

In [7]:
# import libraies
import pandas as pd
import numpy as np
import requests
import time
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import re
from sklearn.ensemble import IsolationForest, RandomForestClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from sklearn.metrics import precision_score, recall_score, f1_score
import tweepy
import newspaper
from newspaper import Article
import warnings
warnings.filterwarnings('ignore')

# installing sentiment analyzer model
nltk.download('vader_lexicon', quiet=True)
# initializing this instance
sia = SentimentIntensityAnalyzer()

# Historical Stock Data Setup
ALPHA_VANTAGE_API_KEY = "PIG3WPABVKTBMH6Y"

# Twitter API setup
TWITTER_API_KEY = "ZkBtakhypMnFkI4dUzVo0QJTw"
TWITTER_API_SECRET = "P1gqWUJsOFkjmkOsPuyB458xi8bwo4KZ1Cy0LbGXPQLxcR3v79"
TWITTER_ACCESS_TOKEN = "1916142259252432899-t6yhBBktXrrexsMqb0DOR9ZgVDJJTB"
TWITTER_ACCESS_SECRET = "L29j1XSBBYhJ614Ev2cE7Ukl3a0Vs5iwMxC9iyOEOJ3oR"

# Authenticate with Twitter API
auth = tweepy.OAuthHandler(TWITTER_API_KEY, TWITTER_API_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET)
twitter_api = tweepy.API(auth)

# Class for stock manipulation detection
class StockManipulationDetector:
    def __init__(self, ticker_symbol, lookback_days=30):
        self.ticker = ticker_symbol
        self.lookback_days = lookback_days
        self.stock_data = None
        self.tweets = None
        self.news = None
        self.anomaly_model = None
        self.manipulation_model = None
        self.scaler = StandardScaler()

    def fetch_stock_data(self):
        """Fetch historical stock data from Alpha Vantage"""
        print(f"Fetching stock data for {self.ticker}...")

        # Daily data
        url = f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={self.ticker}&outputsize=full&apikey={ALPHA_VANTAGE_API_KEY}"
        response = requests.get(url)
        data = response.json()

        if "Time Series (Daily)" not in data:
            print("Error fetching stock data. API response:", data)
            return False

        df = pd.DataFrame(data["Time Series (Daily)"]).T
        df.columns = ["open", "high", "low", "close", "volume"]
        df = df.astype(float)
        df.index = pd.to_datetime(df.index)
        df = df.sort_index()

        # Calculate additional features
        df['price_change'] = df['close'].pct_change()
        df['volume_change'] = df['volume'].pct_change()
        df['high_low_diff'] = df['high'] - df['low']
        df['volatility'] = df['price_change'].rolling(window=5).std()

        # Rolling statistics
        df['price_mean_5d'] = df['close'].rolling(window=5).mean()
        df['volume_mean_5d'] = df['volume'].rolling(window=5).mean()
        df['price_std_5d'] = df['close'].rolling(window=5).std()
        df['volume_std_5d'] = df['volume'].rolling(window=5).std()

        # Z-scores for anomaly detection
        df['price_z_score'] = (df['close'] - df['price_mean_5d']) / df['price_std_5d']
        df['volume_z_score'] = (df['volume'] - df['volume_mean_5d']) / df['volume_std_5d']

        # Momentum indicators
        df['price_momentum'] = df['close'] - df['close'].shift(5)
        df['volume_momentum'] = df['volume'] - df['volume'].shift(5)

        # Filter to relevant period and drop NAs
        df = df.iloc[-self.lookback_days*2:]
        df = df.fillna(0)

        self.stock_data = df
        print(f"Fetched {len(df)} days of stock data")
        return True

    # Simplified Twitter sentiment collection using text files or CSV instead of API
    def fetch_tweets_alternative(self, ticker):
        """Alternative method when Twitter API is unavailable"""
        print(f"Using alternative sentiment data for {ticker}...")

        # Create synthetic sentiment data based on stock price movements
        # This is a fallback when real Twitter data isn't available
        if self.stock_data is not None:
            dates = self.stock_data.index[-30:]  # Last 30 days

            # Create synthetic tweet sentiment that somewhat follows price changes
            # but with some randomness and lag
            price_changes = self.stock_data['price_change'].values[-32:-2]  # Lagged by 2 days

            synthetic_tweet_data = []

            for i, date in enumerate(dates):
                # Base sentiment on lagged price changes with noise
                base_sentiment = price_changes[i] * 5  # Scale up for sentiment range
                sentiment = min(max(base_sentiment + np.random.normal(0, 0.3), -1), 1)  # Bound between -1 and 1

                # Create more tweets on volatile days
                tweet_count = int(50 + abs(sentiment) * 200 + np.random.normal(0, 20))
                tweet_count = max(10, tweet_count)  # At least 10 tweets

                synthetic_tweet_data.append({
                    'date': date,
                    'tweet_sentiment_mean': sentiment,
                    'tweet_sentiment_std': 0.3 + abs(sentiment) * 0.2,
                    'tweet_count': tweet_count,
                    'retweet_count': tweet_count * 3,
                    'favorite_count': tweet_count * 5
                })

            self.tweets = pd.DataFrame(synthetic_tweet_data)
            self.tweets.set_index('date', inplace=True)
            print(f"Created synthetic sentiment data for {ticker}")
            return True

        return False

    # Simplified news collection using text files or CSV instead of web scraping
    def fetch_news_alternative(self, ticker):
        """Alternative method when news scraping is blocked"""
        print(f"Using alternative news data for {ticker}...")

        # Create synthetic news data based on stock movements
        if self.stock_data is not None:
            dates = self.stock_data.index[-30:]  # Last 30 days

            # Create news sentiment that somewhat follows price trends
            # but with occasional contrarian articles
            synthetic_news_data = []

            for date in dates:
                # Get price data for this date if available
                if date in self.stock_data.index:
                    price_change = self.stock_data.loc[date, 'price_change']

                    # Occasionally have contrarian news
                    contrarian = np.random.random() > 0.7

                    if contrarian:
                        # News sentiment opposite to price movement
                        sentiment = -price_change * 3
                    else:
                        # News sentiment aligned with price movement
                        sentiment = price_change * 3

                    sentiment = min(max(sentiment + np.random.normal(0, 0.2), -1), 1)

                    # More news on days with bigger price moves
                    news_count = int(2 + abs(price_change) * 20 + np.random.normal(0, 1))
                    news_count = max(1, news_count)  # At least 1 news item

                    synthetic_news_data.append({
                        'date': date,
                        'news_sentiment_mean': sentiment,
                        'news_sentiment_std': 0.2,
                        'news_count': news_count
                    })

            self.news = pd.DataFrame(synthetic_news_data)
            self.news.set_index('date', inplace=True)
            print(f"Created synthetic news data for {ticker}")
            return True

        return False

    def analyze_sentiment(self):
        """Analyze sentiment from tweets and news, aggregate by day"""
        print("Analyzing sentiment data...")

        # Create date ranges for the period we're analyzing
        end_date = datetime.now().date()
        start_date = end_date - timedelta(days=self.lookback_days)
        date_range = pd.date_range(start=start_date, end=end_date)

        # Initialize sentiment DataFrames
        sentiment_daily = pd.DataFrame(index=date_range)

        # Process tweets sentiment
        if self.tweets is not None and not self.tweets.empty:
            # Convert to datetime and extract date
            self.tweets['date'] = self.tweets['created_at'].dt.date

            # Group by date and calculate metrics
            tweet_sentiment = self.tweets.groupby('date').agg({
                'sentiment': ['mean', 'std', 'count'],
                'retweet_count': 'sum',
                'favorite_count': 'sum'
            })

            tweet_sentiment.columns = ['tweet_sentiment_mean', 'tweet_sentiment_std',
                                     'tweet_count', 'retweet_count', 'favorite_count']

            # Convert index to datetime for proper joining
            tweet_sentiment.index = pd.to_datetime(tweet_sentiment.index)

            # Join with main sentiment DataFrame
            sentiment_daily = sentiment_daily.join(tweet_sentiment)

        # Process news sentiment
        if self.news is not None and not self.news.empty:
            # Convert to datetime and extract date
            self.news['date'] = self.news['published_date'].dt.date

            # Group by date and calculate metrics
            news_sentiment = self.news.groupby('date').agg({
                'sentiment': ['mean', 'std', 'count']
            })

            news_sentiment.columns = ['news_sentiment_mean', 'news_sentiment_std', 'news_count']

            # Convert index to datetime for proper joining
            news_sentiment.index = pd.to_datetime(news_sentiment.index)

            # Join with main sentiment DataFrame
            sentiment_daily = sentiment_daily.join(news_sentiment)

        # Fill NaN values with 0 for calculation purposes
        sentiment_daily = sentiment_daily.fillna(0)

        # Calculate additional metrics
        if 'tweet_sentiment_mean' in sentiment_daily.columns:
            sentiment_daily['tweet_sentiment_zscore'] = (
                sentiment_daily['tweet_sentiment_mean'] -
                sentiment_daily['tweet_sentiment_mean'].rolling(window=5).mean()
            ) / sentiment_daily['tweet_sentiment_mean'].rolling(window=5).std().replace(0, 1)

        if 'news_sentiment_mean' in sentiment_daily.columns:
            sentiment_daily['news_sentiment_zscore'] = (
                sentiment_daily['news_sentiment_mean'] -
                sentiment_daily['news_sentiment_mean'].rolling(window=5).mean()
            ) / sentiment_daily['news_sentiment_mean'].rolling(window=5).std().replace(0, 1)

        # Add sentiment momentum (change from previous day)
        for col in ['tweet_sentiment_mean', 'news_sentiment_mean']:
            if col in sentiment_daily.columns:
                sentiment_daily[f'{col}_change'] = sentiment_daily[col].diff()

        # Add volume change metrics
        for col in ['tweet_count', 'news_count']:
            if col in sentiment_daily.columns:
                sentiment_daily[f'{col}_change'] = sentiment_daily[col].pct_change()
                sentiment_daily[f'{col}_zscore'] = (
                    sentiment_daily[col] -
                    sentiment_daily[col].rolling(window=5).mean()
                ) / sentiment_daily[col].rolling(window=5).std().replace(0, 1)

        return sentiment_daily

    def integrate_data(self):
        """Integrate stock data with sentiment analysis"""
        print("Integrating market and sentiment data...")

        # Get sentiment data
        sentiment_daily = self.analyze_sentiment()

        # Make sure stock_data index is datetime
        self.stock_data.index = pd.to_datetime(self.stock_data.index)

        # Merge sentiment with stock data
        merged_data = self.stock_data.join(sentiment_daily, how='left')

        # Fill missing values
        merged_data = merged_data.fillna(0)

        # Calculate correlations between sentiment and price/volume changes
        # These correlations can help identify manipulation
        if 'tweet_sentiment_mean' in merged_data.columns:
            merged_data['tweet_price_corr'] = merged_data['tweet_sentiment_mean'].rolling(window=5).corr(merged_data['price_change'])
            merged_data['tweet_volume_corr'] = merged_data['tweet_sentiment_mean'].rolling(window=5).corr(merged_data['volume_change'])

        if 'news_sentiment_mean' in merged_data.columns:
            merged_data['news_price_corr'] = merged_data['news_sentiment_mean'].rolling(window=5).corr(merged_data['price_change'])
            merged_data['news_volume_corr'] = merged_data['news_sentiment_mean'].rolling(window=5).corr(merged_data['volume_change'])

        # Add features that might indicate manipulation
        # 1. Abnormal price changes with high sentiment but low news (pump)
        if 'tweet_count' in merged_data.columns and 'news_count' in merged_data.columns:
            merged_data['pump_indicator'] = (
                (merged_data['price_z_score'] > 1.5) &
                (merged_data['tweet_sentiment_zscore'] > 1.5) &
                (merged_data['news_count'] < merged_data['news_count'].mean())
            ).astype(int)

        # 2. High volume with negative sentiment divergence (dump)
        if 'tweet_sentiment_zscore' in merged_data.columns:
            merged_data['dump_indicator'] = (
                (merged_data['volume_z_score'] > 1.5) &
                (merged_data['price_change'] < 0) &
                (merged_data['tweet_sentiment_zscore'] < -1.5)
            ).astype(int)

        # Keep only the most recent lookback days
        merged_data = merged_data.iloc[-self.lookback_days:]

        return merged_data

    def train_anomaly_model(self, data):
        """Train isolation forest model for anomaly detection"""
        print("Training anomaly detection model...")

        # Select features for anomaly detection
        feature_cols = [
            'price_z_score', 'volume_z_score', 'volatility',
            'high_low_diff', 'price_momentum', 'volume_momentum'
        ]

        # Add sentiment features if available
        sentiment_features = [
            'tweet_sentiment_zscore', 'news_sentiment_zscore',
            'tweet_count_zscore', 'news_count_zscore',
            'tweet_price_corr', 'news_price_corr'
        ]

        for feature in sentiment_features:
            if feature in data.columns:
                feature_cols.append(feature)

        # Get feature subset that exists in the data
        valid_features = [col for col in feature_cols if col in data.columns]

        if not valid_features:
            print("No valid features found for anomaly detection")
            return False

        # Extract features
        X = data[valid_features].fillna(0)

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # Train isolation forest
        self.anomaly_model = IsolationForest(
            n_estimators=100,
            contamination=0.05,
            random_state=42
        )

        self.anomaly_model.fit(X_scaled)

        # Add anomaly scores to the data
        data['anomaly_score'] = self.anomaly_model.decision_function(X_scaled)
        data['is_anomaly'] = self.anomaly_model.predict(X_scaled)

        # Convert prediction to binary (1 for normal, -1 for anomaly)
        data['is_anomaly'] = (data['is_anomaly'] == -1).astype(int)

        return data

    def train_manipulation_model(self, data):
        """Train XGBoost model to classify potential manipulation"""
        print("Training manipulation detection model...")

        # For a real system, you would have labeled data of known manipulation cases
        # Since we don't have labels, we'll create synthetic ones based on our indicators

        # Define manipulation as days with anomalies and either pump or dump indicators
        if 'pump_indicator' in data.columns and 'dump_indicator' in data.columns:
            data['potential_manipulation'] = (
                (data['is_anomaly'] == 1) &
                ((data['pump_indicator'] == 1) | (data['dump_indicator'] == 1))
            ).astype(int)
        else:
            # Fallback to just anomalies if we don't have the indicators
            data['potential_manipulation'] = data['is_anomaly']

        # Select features for the classifier
        feature_cols = [
            'price_z_score', 'volume_z_score', 'volatility',
            'high_low_diff', 'price_momentum', 'volume_momentum'
        ]

        # Add sentiment features if available
        sentiment_features = [
            'tweet_sentiment_mean', 'news_sentiment_mean',
            'tweet_count', 'news_count',
            'tweet_sentiment_zscore', 'news_sentiment_zscore',
            'tweet_price_corr', 'news_price_corr'
        ]

        for feature in sentiment_features:
            if feature in data.columns:
                feature_cols.append(feature)

        # Get feature subset that exists in the data
        valid_features = [col for col in feature_cols if col in data.columns]

        if not valid_features:
            print("No valid features found for manipulation model")
            return data

        # Extract features and target
        X = data[valid_features].fillna(0)
        y = data['potential_manipulation']

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # Train XGBoost model
        self.manipulation_model = xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        )

        try:
            # Only train if we have both classes represented
            if len(y.unique()) > 1:
                self.manipulation_model.fit(X_scaled, y)

                # Add predictions to the data
                data['manipulation_probability'] = self.manipulation_model.predict_proba(X_scaled)[:, 1]
                data['predicted_manipulation'] = self.manipulation_model.predict(X_scaled)

                # Feature importance
                feature_importance = pd.DataFrame({
                    'feature': valid_features,
                    'importance': self.manipulation_model.feature_importances_
                }).sort_values('importance', ascending=False)

                print("\nTop manipulation indicators:")
                print(feature_importance.head(5))
            else:
                print("Not enough variation in the target variable to train classifier")
                data['manipulation_probability'] = 0
                data['predicted_manipulation'] = 0

        except Exception as e:
            print(f"Error training manipulation model: {e}")
            data['manipulation_probability'] = 0
            data['predicted_manipulation'] = 0

        return data

    def detect_manipulation(self, demo_mode=True):
        """Main method to run the entire detection pipeline"""
        # Fetch data
        if not self.fetch_stock_data():
            print("Failed to fetch stock data. Aborting.")
            return None

        if demo_mode:
            # Use alternative data sources that don't require APIs
            self.fetch_tweets_alternative(self.ticker)
            self.fetch_news_alternative(self.ticker)
        else:
            # Try to use actual APIs (may fail with current limitations)
            self.fetch_tweets()
            self.fetch_news()

    # Rest of the method remains the same...

    def display_results(self, data):
        """Display detection results and visualizations"""
        if data is None or data.empty:
            print("No data available to display results")
            return

        # Print summary of detected manipulations
        print("\n----- MANIPULATION DETECTION SUMMARY -----")

        # Filter to just the most recent period
        recent_data = data.iloc[-self.lookback_days:]

        # Count days with potential manipulation
        if 'predicted_manipulation' in recent_data.columns:
            manipulation_days = recent_data[recent_data['predicted_manipulation'] == 1]
            n_manipulation_days = len(manipulation_days)

            print(f"Detected potential manipulation on {n_manipulation_days} days out of {len(recent_data)} analyzed.")

            if n_manipulation_days > 0:
                print("\nDates with suspected manipulation:")
                for date, row in manipulation_days.iterrows():
                    features = []

                    # Add indicators that triggered the alert
                    if row['price_z_score'] > 1.5:
                        features.append(f"Abnormal price (z={row['price_z_score']:.2f})")
                    if row['volume_z_score'] > 1.5:
                        features.append(f"Abnormal volume (z={row['volume_z_score']:.2f})")
                    if 'tweet_sentiment_zscore' in row and row['tweet_sentiment_zscore'] > 1.5:
                        features.append(f"Abnormal social sentiment (z={row['tweet_sentiment_zscore']:.2f})")
                    if 'pump_indicator' in row and row['pump_indicator'] == 1:
                        features.append("Pump pattern")
                    if 'dump_indicator' in row and row['dump_indicator'] == 1:
                        features.append("Dump pattern")

                    print(f"  {date.date()}: {', '.join(features)}")
        else:
            print("Manipulation classification not available.")

        # Plot results
        try:
            self.plot_results(data)
        except Exception as e:
            print(f"Error creating plots: {e}")

    def plot_results(self, data):
        """Create visualizations of the detection results"""
        fig, axes = plt.subplots(3, 1, figsize=(14, 18), sharex=True)

        # Stock price with anomaly highlighting
        ax0 = axes[0]
        ax0.set_title(f"{self.ticker} Stock Price with Anomaly Detection", fontsize=14)
        ax0.plot(data.index, data['close'], label='Close Price', color='blue')

        # Highlight anomalies if available
        if 'is_anomaly' in data.columns:
            anomaly_days = data[data['is_anomaly'] == 1]
            ax0.scatter(anomaly_days.index, anomaly_days['close'],
                      color='red', label='Anomalies', zorder=5)

        # Highlight manipulation if available
        if 'predicted_manipulation' in data.columns:
            manip_days = data[data['predicted_manipulation'] == 1]
            ax0.scatter(manip_days.index, manip_days['close'],
                      color='darkred', marker='X', s=100,
                      label='Potential Manipulation', zorder=10)

        ax0.set_ylabel('Price ($)')
        ax0.legend()
        ax0.grid(True, alpha=0.3)

        # Volume plot
        ax1 = axes[1]
        ax1.set_title(f"{self.ticker} Trading Volume", fontsize=14)
        ax1.bar(data.index, data['volume'], color='green', alpha=0.7, label='Volume')

        # Highlight volume anomalies
        volume_anomalies = data[data['volume_z_score'] > 1.5]
        ax1.bar(volume_anomalies.index, volume_anomalies['volume'], color='orange', label='Volume Anomalies')

        ax1.set_ylabel('Volume')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Sentiment plot
        ax2 = axes[2]
        ax2.set_title("Sentiment Analysis", fontsize=14)

        if 'tweet_sentiment_mean' in data.columns:
            ax2.plot(data.index, data['tweet_sentiment_mean'],
                   label='Social Sentiment', color='purple')

        if 'news_sentiment_mean' in data.columns:
            ax2.plot(data.index, data['news_sentiment_mean'],
                   label='News Sentiment', color='brown')

        # Add zero line
        ax2.axhline(y=0, color='gray', linestyle='-', alpha=0.5)

        ax2.set_ylabel('Sentiment Score')
        ax2.set_xlabel('Date')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(f"{self.ticker}_manipulation_analysis.png")
        plt.close()

        print(f"\nSaved visualization to {self.ticker}_manipulation_analysis.png")

def test_detector(ticker_symbol, demo_mode=True):
    print(f"\n===== ANALYZING {ticker_symbol} =====")
    detector = StockManipulationDetector(ticker_symbol, lookback_days=30)
    results = detector.detect_manipulation(demo_mode=demo_mode)
    return results
def evaluate_models(self, data):
    """Evaluate the performance of the anomaly and manipulation detection models"""
    print("\n----- MODEL EVALUATION -----")

    # For evaluation, we'll use our synthetic labels as "ground truth"
    # In a real-world scenario, you would need human-labeled examples

    # Evaluate anomaly detection
    if 'is_anomaly' in data.columns and 'potential_manipulation' in data.columns:
        # Use potential_manipulation as a proxy for ground truth
        y_true = data['potential_manipulation']
        y_pred = data['is_anomaly']

        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print("\nAnomaly Detection Performance:")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

    # Evaluate manipulation detection
    if 'predicted_manipulation' in data.columns and 'potential_manipulation' in data.columns:
        # Compare predictions against our synthetic "ground truth"
        y_true = data['potential_manipulation']
        y_pred = data['predicted_manipulation']

        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print("\nManipulation Detection Performance:")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        # Calculate accuracy
        accuracy = (y_true == y_pred).mean()
        print(f"Accuracy: {accuracy:.4f}")

        # Return the overall accuracy
        return accuracy

    return None

# Main function
def main():
    # Test with a few stocks known for volatility and social media attention
    tickers = ["GME", "AMC", "TSLA", "AAPL"]

    for ticker in tickers:
        test_detector(ticker)
        # Add delay between API calls to avoid rate limits
        time.sleep(2)

if __name__ == "__main__":
    main()

AttributeError: partially initialized module 'nltk' has no attribute 'data' (most likely due to a circular import)

In [None]:
def evaluate_models(self, data):
    """Evaluate the performance of the anomaly and manipulation detection models"""
    print("\n----- MODEL EVALUATION -----")

    # For evaluation, we'll use our synthetic labels as "ground truth"
    # In a real-world scenario, you would need human-labeled examples

    # Evaluate anomaly detection
    if 'is_anomaly' in data.columns and 'potential_manipulation' in data.columns:
        # Use potential_manipulation as a proxy for ground truth
        y_true = data['potential_manipulation']
        y_pred = data['is_anomaly']

        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print("\nAnomaly Detection Performance:")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

    # Evaluate manipulation detection
    if 'predicted_manipulation' in data.columns and 'potential_manipulation' in data.columns:
        # Compare predictions against our synthetic "ground truth"
        y_true = data['potential_manipulation']
        y_pred = data['predicted_manipulation']

        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        print("\nManipulation Detection Performance:")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        # Calculate accuracy
        accuracy = (y_true == y_pred).mean()
        print(f"Accuracy: {accuracy:.4f}")

        # Return the overall accuracy
        return accuracy

    return None

In [None]:
# import libraies
import pandas as pd
import numpy as np
import requests
import time
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import re
from sklearn.ensemble import IsolationForest, RandomForestClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from sklearn.metrics import precision_score, recall_score, f1_score
import tweepy
import newspaper
from newspaper import Article
import warnings
warnings.filterwarnings('ignore')

# installing sentiment analyzer model
nltk.download('vader_lexicon', quiet=True)
# initializing this instance
sia = SentimentIntensityAnalyzer()

# Historical Stock Data Setup
ALPHA_VANTAGE_API_KEY = "PIG3WPABVKTBMH6Y"

# Twitter API setup
TWITTER_API_KEY = "ZkBtakhypMnFkI4dUzVo0QJTw"
TWITTER_API_SECRET = "P1gqWUJsOFkjmkOsPuyB458xi8bwo4KZ1Cy0LbGXPQLxcR3v79"
TWITTER_ACCESS_TOKEN = "1916142259252432899-t6yhBBktXrrexsMqb0DOR9ZgVDJJTB"
TWITTER_ACCESS_SECRET = "L29j1XSBBYhJ614Ev2cE7Ukl3a0Vs5iwMxC9iyOEOJ3oR"

# Authenticate with Twitter API
auth = tweepy.OAuthHandler(TWITTER_API_KEY, TWITTER_API_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET)
twitter_api = tweepy.API(auth)

# Class for stock manipulation detection
class StockManipulationDetector:
    def __init__(self, ticker_symbol, lookback_days=30):
        self.ticker = ticker_symbol
        self.lookback_days = lookback_days
        self.stock_data = None
        self.tweets = None
        self.news = None
        self.anomaly_model = None
        self.manipulation_model = None
        self.scaler = StandardScaler()

    def fetch_stock_data(self):
        """Fetch historical stock data from Alpha Vantage"""
        print(f"Fetching stock data for {self.ticker}...")

        # Daily data
        url = f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={self.ticker}&outputsize=full&apikey={ALPHA_VANTAGE_API_KEY}"
        response = requests.get(url)
        data = response.json()

        if "Time Series (Daily)" not in data:
            print("Error fetching stock data. API response:", data)
            return False

        df = pd.DataFrame(data["Time Series (Daily)"]).T
        df.columns = ["open", "high", "low", "close", "volume"]
        df = df.astype(float)
        df.index = pd.to_datetime(df.index)
        df = df.sort_index()

        # Calculate additional features
        df['price_change'] = df['close'].pct_change()
        df['volume_change'] = df['volume'].pct_change()
        df['high_low_diff'] = df['high'] - df['low']
        df['volatility'] = df['price_change'].rolling(window=5).std()

        # Rolling statistics
        df['price_mean_5d'] = df['close'].rolling(window=5).mean()
        df['volume_mean_5d'] = df['volume'].rolling(window=5).mean()
        df['price_std_5d'] = df['close'].rolling(window=5).std()
        df['volume_std_5d'] = df['volume'].rolling(window=5).std()

        # Z-scores for anomaly detection
        df['price_z_score'] = (df['close'] - df['price_mean_5d']) / df['price_std_5d']
        df['volume_z_score'] = (df['volume'] - df['volume_mean_5d']) / df['volume_std_5d']

        # Momentum indicators
        df['price_momentum'] = df['close'] - df['close'].shift(5)
        df['volume_momentum'] = df['volume'] - df['volume'].shift(5)

        # Filter to relevant period and drop NAs
        df = df.iloc[-self.lookback_days*2:]
        df = df.fillna(0)

        self.stock_data = df
        print(f"Fetched {len(df)} days of stock data")
        return True

    # Simplified Twitter sentiment collection using text files or CSV instead of API
    def fetch_tweets_alternative(self, ticker):
        """Alternative method when Twitter API is unavailable"""
        print(f"Using alternative sentiment data for {ticker}...")

        # Create synthetic sentiment data based on stock price movements
        # This is a fallback when real Twitter data isn't available
        if self.stock_data is not None:
            dates = self.stock_data.index[-30:]  # Last 30 days

            # Create synthetic tweet sentiment that somewhat follows price changes
            # but with some randomness and lag
            price_changes = self.stock_data['price_change'].values[-32:-2]  # Lagged by 2 days

            synthetic_tweet_data = []

            for i, date in enumerate(dates):
                # Base sentiment on lagged price changes with noise
                base_sentiment = price_changes[i] * 5  # Scale up for sentiment range
                sentiment = min(max(base_sentiment + np.random.normal(0, 0.3), -1), 1)  # Bound between -1 and 1

                # Create more tweets on volatile days
                tweet_count = int(50 + abs(sentiment) * 200 + np.random.normal(0, 20))
                tweet_count = max(10, tweet_count)  # At least 10 tweets

                synthetic_tweet_data.append({
                    'date': date,
                    'tweet_sentiment_mean': sentiment,
                    'tweet_sentiment_std': 0.3 + abs(sentiment) * 0.2,
                    'tweet_count': tweet_count,
                    'retweet_count': tweet_count * 3,
                    'favorite_count': tweet_count * 5
                })

            self.tweets = pd.DataFrame(synthetic_tweet_data)
            self.tweets.set_index('date', inplace=True)
            print(f"Created synthetic sentiment data for {ticker}")
            return True

        return False

    # Simplified news collection using text files or CSV instead of web scraping
    def fetch_news_alternative(self, ticker):
        """Alternative method when news scraping is blocked"""
        print(f"Using alternative news data for {ticker}...")

        # Create synthetic news data based on stock movements
        if self.stock_data is not None:
            dates = self.stock_data.index[-30:]  # Last 30 days

            # Create news sentiment that somewhat follows price trends
            # but with occasional contrarian articles
            synthetic_news_data = []

            for date in dates:
                # Get price data for this date if available
                if date in self.stock_data.index:
                    price_change = self.stock_data.loc[date, 'price_change']

                    # Occasionally have contrarian news
                    contrarian = np.random.random() > 0.7

                    if contrarian:
                        # News sentiment opposite to price movement
                        sentiment = -price_change * 3
                    else:
                        # News sentiment aligned with price movement
                        sentiment = price_change * 3

                    sentiment = min(max(sentiment + np.random.normal(0, 0.2), -1), 1)

                    # More news on days with bigger price moves
                    news_count = int(2 + abs(price_change) * 20 + np.random.normal(0, 1))
                    news_count = max(1, news_count)  # At least 1 news item

                    synthetic_news_data.append({
                        'date': date,
                        'news_sentiment_mean': sentiment,
                        'news_sentiment_std': 0.2,
                        'news_count': news_count
                    })

            self.news = pd.DataFrame(synthetic_news_data)
            self.news.set_index('date', inplace=True)
            print(f"Created synthetic news data for {ticker}")
            return True

        return False

    def analyze_sentiment(self):
        """Analyze sentiment from tweets and news, aggregate by day"""
        print("Analyzing sentiment data...")

        # Create date ranges for the period we're analyzing
        end_date = datetime.now().date()
        start_date = end_date - timedelta(days=self.lookback_days)
        date_range = pd.date_range(start=start_date, end=end_date)

        # Initialize sentiment DataFrames
        sentiment_daily = pd.DataFrame(index=date_range)

        # Process tweets sentiment
        if self.tweets is not None and not self.tweets.empty:
            # Convert to datetime and extract date
            self.tweets['date'] = self.tweets['created_at'].dt.date

            # Group by date and calculate metrics
            tweet_sentiment = self.tweets.groupby('date').agg({
                'sentiment': ['mean', 'std', 'count'],
                'retweet_count': 'sum',
                'favorite_count': 'sum'
            })

            tweet_sentiment.columns = ['tweet_sentiment_mean', 'tweet_sentiment_std',
                                     'tweet_count', 'retweet_count', 'favorite_count']

            # Convert index to datetime for proper joining
            tweet_sentiment.index = pd.to_datetime(tweet_sentiment.index)

            # Join with main sentiment DataFrame
            sentiment_daily = sentiment_daily.join(tweet_sentiment)

        # Process news sentiment
        if self.news is not None and not self.news.empty:
            # Convert to datetime and extract date
            self.news['date'] = self.news['published_date'].dt.date

            # Group by date and calculate metrics
            news_sentiment = self.news.groupby('date').agg({
                'sentiment': ['mean', 'std', 'count']
            })

            news_sentiment.columns = ['news_sentiment_mean', 'news_sentiment_std', 'news_count']

            # Convert index to datetime for proper joining
            news_sentiment.index = pd.to_datetime(news_sentiment.index)

            # Join with main sentiment DataFrame
            sentiment_daily = sentiment_daily.join(news_sentiment)

        # Fill NaN values with 0 for calculation purposes
        sentiment_daily = sentiment_daily.fillna(0)

        # Calculate additional metrics
        if 'tweet_sentiment_mean' in sentiment_daily.columns:
            sentiment_daily['tweet_sentiment_zscore'] = (
                sentiment_daily['tweet_sentiment_mean'] -
                sentiment_daily['tweet_sentiment_mean'].rolling(window=5).mean()
            ) / sentiment_daily['tweet_sentiment_mean'].rolling(window=5).std().replace(0, 1)

        if 'news_sentiment_mean' in sentiment_daily.columns:
            sentiment_daily['news_sentiment_zscore'] = (
                sentiment_daily['news_sentiment_mean'] -
                sentiment_daily['news_sentiment_mean'].rolling(window=5).mean()
            ) / sentiment_daily['news_sentiment_mean'].rolling(window=5).std().replace(0, 1)

        # Add sentiment momentum (change from previous day)
        for col in ['tweet_sentiment_mean', 'news_sentiment_mean']:
            if col in sentiment_daily.columns:
                sentiment_daily[f'{col}_change'] = sentiment_daily[col].diff()

        # Add volume change metrics
        for col in ['tweet_count', 'news_count']:
            if col in sentiment_daily.columns:
                sentiment_daily[f'{col}_change'] = sentiment_daily[col].pct_change()
                sentiment_daily[f'{col}_zscore'] = (
                    sentiment_daily[col] -
                    sentiment_daily[col].rolling(window=5).mean()
                ) / sentiment_daily[col].rolling(window=5).std().replace(0, 1)

        return sentiment_daily

    def integrate_data(self):
        """Integrate stock data with sentiment analysis"""
        print("Integrating market and sentiment data...")

        # Get sentiment data
        sentiment_daily = self.analyze_sentiment()

        # Make sure stock_data index is datetime
        self.stock_data.index = pd.to_datetime(self.stock_data.index)

        # Merge sentiment with stock data
        merged_data = self.stock_data.join(sentiment_daily, how='left')

        # Fill missing values
        merged_data = merged_data.fillna(0)

        # Calculate correlations between sentiment and price/volume changes
        # These correlations can help identify manipulation
        if 'tweet_sentiment_mean' in merged_data.columns:
            merged_data['tweet_price_corr'] = merged_data['tweet_sentiment_mean'].rolling(window=5).corr(merged_data['price_change'])
            merged_data['tweet_volume_corr'] = merged_data['tweet_sentiment_mean'].rolling(window=5).corr(merged_data['volume_change'])

        if 'news_sentiment_mean' in merged_data.columns:
            merged_data['news_price_corr'] = merged_data['news_sentiment_mean'].rolling(window=5).corr(merged_data['price_change'])
            merged_data['news_volume_corr'] = merged_data['news_sentiment_mean'].rolling(window=5).corr(merged_data['volume_change'])

        # Add features that might indicate manipulation
        # 1. Abnormal price changes with high sentiment but low news (pump)
        if 'tweet_count' in merged_data.columns and 'news_count' in merged_data.columns:
            merged_data['pump_indicator'] = (
                (merged_data['price_z_score'] > 1.5) &
                (merged_data['tweet_sentiment_zscore'] > 1.5) &
                (merged_data['news_count'] < merged_data['news_count'].mean())
            ).astype(int)

        # 2. High volume with negative sentiment divergence (dump)
        if 'tweet_sentiment_zscore' in merged_data.columns:
            merged_data['dump_indicator'] = (
                (merged_data['volume_z_score'] > 1.5) &
                (merged_data['price_change'] < 0) &
                (merged_data['tweet_sentiment_zscore'] < -1.5)
            ).astype(int)

        # Keep only the most recent lookback days
        merged_data = merged_data.iloc[-self.lookback_days:]

        return merged_data

    def train_anomaly_model(self, data):
        """Train isolation forest model for anomaly detection"""
        print("Training anomaly detection model...")

        # Select features for anomaly detection
        feature_cols = [
            'price_z_score', 'volume_z_score', 'volatility',
            'high_low_diff', 'price_momentum', 'volume_momentum'
        ]

        # Add sentiment features if available
        sentiment_features = [
            'tweet_sentiment_zscore', 'news_sentiment_zscore',
            'tweet_count_zscore', 'news_count_zscore',
            'tweet_price_corr', 'news_price_corr'
        ]

        for feature in sentiment_features:
            if feature in data.columns:
                feature_cols.append(feature)

        # Get feature subset that exists in the data
        valid_features = [col for col in feature_cols if col in data.columns]

        if not valid_features:
            print("No valid features found for anomaly detection")
            return False

        # Extract features
        X = data[valid_features].fillna(0)

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # Train isolation forest
        self.anomaly_model = IsolationForest(
            n_estimators=100,
            contamination=0.05,
            random_state=42
        )

        self.anomaly_model.fit(X_scaled)

        # Add anomaly scores to the data
        data['anomaly_score'] = self.anomaly_model.decision_function(X_scaled)
        data['is_anomaly'] = self.anomaly_model.predict(X_scaled)

        # Convert prediction to binary (1 for normal, -1 for anomaly)
        data['is_anomaly'] = (data['is_anomaly'] == -1).astype(int)

        return data

    def train_manipulation_model(self, data):
        """Train XGBoost model to classify potential manipulation"""
        print("Training manipulation detection model...")

        # For a real system, you would have labeled data of known manipulation cases
        # Since we don't have labels, we'll create synthetic ones based on our indicators
        # ... existing code ...

        # Define manipulation as days with anomalies and either pump or dump indicators
        if 'pump_indicator' in data.columns and 'dump_indicator' in data.columns:
            data['potential_manipulation'] = (
                (data['is_anomaly'] == 1) &
                ((data['pump_indicator'] == 1) | (data['dump_indicator'] == 1))
            ).astype(int)
        else:
            # Fallback to just anomalies if we don't have the indicators
            data['potential_manipulation'] = data['is_anomaly']

        # Select features for the classifier
        feature_cols = [
            'price_z_score', 'volume_z_score', 'volatility',
            'high_low_diff', 'price_momentum', 'volume_momentum'
        ]

        # Add sentiment features if available
        sentiment_features = [
            'tweet_sentiment_mean', 'news_sentiment_mean',
            'tweet_count', 'news_count',
            'tweet_sentiment_zscore', 'news_sentiment_zscore',
            'tweet_price_corr', 'news_price_corr'
        ]

        for feature in sentiment_features:
            if feature in data.columns:
                feature_cols.append(feature)

        # Get feature subset that exists in the data
        valid_features = [col for col in feature_cols if col in data.columns]

        if not valid_features:
            print("No valid features found for manipulation model")
            return data

        # Extract features and target
        X = data[valid_features].fillna(0)
        y = data['potential_manipulation']

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # Train XGBoost model
        self.manipulation_model = xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        )

        try:
            # Only train if we have both classes represented
            if len(y.unique()) > 1:
                # Split data into train and test sets
                from sklearn.model_selection import train_test_split
                X_train, X_test, y_train, y_test = train_test_split(
                    X_scaled, y, test_size=0.2, random_state=42
                )

                # Train model
                self.manipulation_model.fit(X_train, y_train)

                # Make predictions on test set
                y_pred = self.manipulation_model.predict(X_test)
                y_pred_proba = self.manipulation_model.predict_proba(X_test)[:, 1]

                # Calculate metrics
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)

                print("\n----- Model Performance Metrics -----")
                print(f"Precision: {precision:.3f}")
                print(f"Recall: {recall:.3f}")
                print(f"F1 Score: {f1:.3f}")

                # Add predictions to the data
                data['manipulation_probability'] = self.manipulation_model.predict_proba(X_scaled)[:, 1]
                data['predicted_manipulation'] = self.manipulation_model.predict(X_scaled)

                # Feature importance
                feature_importance = pd.DataFrame({
                    'feature': valid_features,
                    'importance': self.manipulation_model.feature_importances_
                }).sort_values('importance', ascending=False)

                print("\nTop manipulation indicators:")
                print(feature_importance.head(5))

                # Plot ROC curve
                from sklearn.metrics import roc_curve, auc
                fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
                roc_auc = auc(fpr, tpr)

                plt.figure(figsize=(8, 6))
                plt.plot(fpr, tpr, color='darkorange', lw=2,
                        label=f'ROC curve (AUC = {roc_auc:.2f})')
                plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title('Receiver Operating Characteristic (ROC) Curve')
                plt.legend(loc="lower right")
                plt.show()

            else:
                print("Not enough variation in the target variable to train classifier")
                data['manipulation_probability'] = 0
                data['predicted_manipulation'] = 0

        except Exception as e:
            print(f"Error training manipulation model: {e}")
            data['manipulation_probability'] = 0
            data['predicted_manipulation'] = 0

        return data


    def detect_manipulation(self, demo_mode=True):
        """Main method to run the entire detection pipeline"""
        # Fetch data
        if not self.fetch_stock_data():
            print("Failed to fetch stock data. Aborting.")
            return None

        if demo_mode:
            # Use alternative data sources that don't require APIs
            self.fetch_tweets_alternative(self.ticker)
            self.fetch_news_alternative(self.ticker)
        else:
            # Try to use actual APIs (may fail with current limitations)
            self.fetch_tweets()
            self.fetch_news()

    # Rest of the method remains the same...

    def display_results(self, data):
        """Display detection results and visualizations"""
        if data is None or data.empty:
            print("No data available to display results")
            return

        # Print summary of detected manipulations
        print("\n----- MANIPULATION DETECTION SUMMARY -----")

        # Filter to just the most recent period
        recent_data = data.iloc[-self.lookback_days:]

        # Count days with potential manipulation
        if 'predicted_manipulation' in recent_data.columns:
            manipulation_days = recent_data[recent_data['predicted_manipulation'] == 1]
            n_manipulation_days = len(manipulation_days)

            print(f"Detected potential manipulation on {n_manipulation_days} days out of {len(recent_data)} analyzed.")

            if n_manipulation_days > 0:
                print("\nDates with suspected manipulation:")
                for date, row in manipulation_days.iterrows():
                    features = []

                    # Add indicators that triggered the alert
                    if row['price_z_score'] > 1.5:
                        features.append(f"Abnormal price (z={row['price_z_score']:.2f})")
                    if row['volume_z_score'] > 1.5:
                        features.append(f"Abnormal volume (z={row['volume_z_score']:.2f})")
                    if 'tweet_sentiment_zscore' in row and row['tweet_sentiment_zscore'] > 1.5:
                        features.append(f"Abnormal social sentiment (z={row['tweet_sentiment_zscore']:.2f})")
                    if 'pump_indicator' in row and row['pump_indicator'] == 1:
                        features.append("Pump pattern")
                    if 'dump_indicator' in row and row['dump_indicator'] == 1:
                        features.append("Dump pattern")

                    print(f"  {date.date()}: {', '.join(features)}")
        else:
            print("Manipulation classification not available.")

        # Plot results
        try:
            self.plot_results(data)
        except Exception as e:
            print(f"Error creating plots: {e}")

    def plot_results(self, data):
        """Create visualizations of the detection results"""
        fig, axes = plt.subplots(3, 1, figsize=(14, 18), sharex=True)

        # Stock price with anomaly highlighting
        ax0 = axes[0]
        ax0.set_title(f"{self.ticker} Stock Price with Anomaly Detection", fontsize=14)
        ax0.plot(data.index, data['close'], label='Close Price', color='blue')

        # Highlight anomalies if available
        if 'is_anomaly' in data.columns:
            anomaly_days = data[data['is_anomaly'] == 1]
            ax0.scatter(anomaly_days.index, anomaly_days['close'],
                      color='red', label='Anomalies', zorder=5)

        # Highlight manipulation if available
        if 'predicted_manipulation' in data.columns:
            manip_days = data[data['predicted_manipulation'] == 1]
            ax0.scatter(manip_days.index, manip_days['close'],
                      color='darkred', marker='X', s=100,
                      label='Potential Manipulation', zorder=10)

        ax0.set_ylabel('Price ($)')
        ax0.legend()
        ax0.grid(True, alpha=0.3)

        # Volume plot
        ax1 = axes[1]
        ax1.set_title(f"{self.ticker} Trading Volume", fontsize=14)
        ax1.bar(data.index, data['volume'], color='green', alpha=0.7, label='Volume')

        # Highlight volume anomalies
        volume_anomalies = data[data['volume_z_score'] > 1.5]
        ax1.bar(volume_anomalies.index, volume_anomalies['volume'], color='orange', label='Volume Anomalies')

        ax1.set_ylabel('Volume')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Sentiment plot
        ax2 = axes[2]
        ax2.set_title("Sentiment Analysis", fontsize=14)

        if 'tweet_sentiment_mean' in data.columns:
            ax2.plot(data.index, data['tweet_sentiment_mean'],
                   label='Social Sentiment', color='purple')

        if 'news_sentiment_mean' in data.columns:
            ax2.plot(data.index, data['news_sentiment_mean'],
                   label='News Sentiment', color='brown')

        # Add zero line
        ax2.axhline(y=0, color='gray', linestyle='-', alpha=0.5)

        ax2.set_ylabel('Sentiment Score')
        ax2.set_xlabel('Date')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(f"{self.ticker}_manipulation_analysis.png")
        plt.close()

        print(f"\nSaved visualization to {self.ticker}_manipulation_analysis.png")

def test_detector(ticker_symbol, demo_mode=True):
    print(f"\n===== ANALYZING {ticker_symbol} =====")
    detector = StockManipulationDetector(ticker_symbol, lookback_days=30)
    results = detector.detect_manipulation(demo_mode=demo_mode)
    return results

# Main function
def main():
    # Test with a few stocks known for volatility and social media attention
    tickers = ["GME", "AMC", "TSLA", "AAPL"]

    for ticker in tickers:
        test_detector(ticker)
        # Add delay between API calls to avoid rate limits
        time.sleep(2)

if __name__ == "__main__":
    main()


===== ANALYZING GME =====
Fetching stock data for GME...
Error fetching stock data. API response: {'Information': 'We have detected your API key as PIG3WPABVKTBMH6Y and our standard API rate limit is 25 requests per day. Please subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly remove all daily rate limits.'}
Failed to fetch stock data. Aborting.

===== ANALYZING AMC =====
Fetching stock data for AMC...
Error fetching stock data. API response: {'Information': 'We have detected your API key as PIG3WPABVKTBMH6Y and our standard API rate limit is 25 requests per day. Please subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly remove all daily rate limits.'}
Failed to fetch stock data. Aborting.

===== ANALYZING TSLA =====
Fetching stock data for TSLA...
Error fetching stock data. API response: {'Information': 'We have detected your API key as PIG3WPABVKTBMH6Y and our standard API rate limit is 25 requests per d

In [12]:
# import libraies
import pandas as pd
import numpy as np
import requests
import time
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import re
from sklearn.ensemble import IsolationForest, RandomForestClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from sklearn.metrics import precision_score, recall_score, f1_score
import tweepy
import newspaper
from newspaper import Article
import warnings
warnings.filterwarnings('ignore')

# installing sentiment analyzer model
nltk.download('vader_lexicon', quiet=True)
# initializing this instance
sia = SentimentIntensityAnalyzer()

# Historical Stock Data Setup
ALPHA_VANTAGE_API_KEY = "PIG3WPABVKTBMH6Y"

# Twitter API setup
TWITTER_API_KEY = "ZkBtakhypMnFkI4dUzVo0QJTw"
TWITTER_API_SECRET = "P1gqWUJsOFkjmkOsPuyB458xi8bwo4KZ1Cy0LbGXPQLxcR3v79"
TWITTER_ACCESS_TOKEN = "1916142259252432899-t6yhBBktXrrexsMqb0DOR9ZgVDJJTB"
TWITTER_ACCESS_SECRET = "L29j1XSBBYhJ614Ev2cE7Ukl3a0Vs5iwMxC9iyOEOJ3oR"

# Authenticate with Twitter API
auth = tweepy.OAuthHandler(TWITTER_API_KEY, TWITTER_API_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET)
twitter_api = tweepy.API(auth)

# Class for stock manipulation detection
class StockManipulationDetector:
    def __init__(self, ticker_symbol, lookback_days=30):
        self.ticker = ticker_symbol
        self.lookback_days = lookback_days
        self.stock_data = None
        self.tweets = None
        self.news = None
        self.anomaly_model = None
        self.manipulation_model = None
        self.scaler = StandardScaler()

    def fetch_stock_data(self):
        """Fetch historical stock data from Alpha Vantage"""
        print(f"Fetching stock data for {self.ticker}...")

        # Daily data
        url = f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={self.ticker}&outputsize=full&apikey={ALPHA_VANTAGE_API_KEY}"
        response = requests.get(url)
        data = response.json()

        if "Time Series (Daily)" not in data:
            print("Error fetching stock data. API response:", data)
            return False

        df = pd.DataFrame(data["Time Series (Daily)"]).T
        df.columns = ["open", "high", "low", "close", "volume"]
        df = df.astype(float)
        df.index = pd.to_datetime(df.index)
        df = df.sort_index()

        # Calculate additional features
        df['price_change'] = df['close'].pct_change()
        df['volume_change'] = df['volume'].pct_change()
        df['high_low_diff'] = df['high'] - df['low']
        df['volatility'] = df['price_change'].rolling(window=5).std()

        # Rolling statistics
        df['price_mean_5d'] = df['close'].rolling(window=5).mean()
        df['volume_mean_5d'] = df['volume'].rolling(window=5).mean()
        df['price_std_5d'] = df['close'].rolling(window=5).std()
        df['volume_std_5d'] = df['volume'].rolling(window=5).std()

        # Z-scores for anomaly detection
        df['price_z_score'] = (df['close'] - df['price_mean_5d']) / df['price_std_5d']
        df['volume_z_score'] = (df['volume'] - df['volume_mean_5d']) / df['volume_std_5d']

        # Momentum indicators
        df['price_momentum'] = df['close'] - df['close'].shift(5)
        df['volume_momentum'] = df['volume'] - df['volume'].shift(5)

        # Filter to relevant period and drop NAs
        df = df.iloc[-self.lookback_days*2:]
        df = df.fillna(0)

        self.stock_data = df
        print(f"Fetched {len(df)} days of stock data")
        return True

    # Simplified Twitter sentiment collection using text files or CSV instead of API
    def fetch_tweets_alternative(self, ticker):
        """Alternative method when Twitter API is unavailable"""
        print(f"Using alternative sentiment data for {ticker}...")

        # Create synthetic sentiment data based on stock price movements
        # This is a fallback when real Twitter data isn't available
        if self.stock_data is not None:
            dates = self.stock_data.index[-30:]  # Last 30 days

            # Create synthetic tweet sentiment that somewhat follows price changes
            # but with some randomness and lag
            price_changes = self.stock_data['price_change'].values[-32:-2]  # Lagged by 2 days

            synthetic_tweet_data = []

            for i, date in enumerate(dates):
                # Base sentiment on lagged price changes with noise
                base_sentiment = price_changes[i] * 5  # Scale up for sentiment range
                sentiment = min(max(base_sentiment + np.random.normal(0, 0.3), -1), 1)  # Bound between -1 and 1

                # Create more tweets on volatile days
                tweet_count = int(50 + abs(sentiment) * 200 + np.random.normal(0, 20))
                tweet_count = max(10, tweet_count)  # At least 10 tweets

                synthetic_tweet_data.append({
                    'date': date,
                    'tweet_sentiment_mean': sentiment,
                    'tweet_sentiment_std': 0.3 + abs(sentiment) * 0.2,
                    'tweet_count': tweet_count,
                    'retweet_count': tweet_count * 3,
                    'favorite_count': tweet_count * 5
                })

            self.tweets = pd.DataFrame(synthetic_tweet_data)
            self.tweets.set_index('date', inplace=True)
            print(f"Created synthetic sentiment data for {ticker}")
            return True

        return False

    # Simplified news collection using text files or CSV instead of web scraping
    def fetch_news_alternative(self, ticker):
        """Alternative method when news scraping is blocked"""
        print(f"Using alternative news data for {ticker}...")

        # Create synthetic news data based on stock movements
        if self.stock_data is not None:
            dates = self.stock_data.index[-30:]  # Last 30 days

            # Create news sentiment that somewhat follows price trends
            # but with occasional contrarian articles
            synthetic_news_data = []

            for date in dates:
                # Get price data for this date if available
                if date in self.stock_data.index:
                    price_change = self.stock_data.loc[date, 'price_change']

                    # Occasionally have contrarian news
                    contrarian = np.random.random() > 0.7

                    if contrarian:
                        # News sentiment opposite to price movement
                        sentiment = -price_change * 3
                    else:
                        # News sentiment aligned with price movement
                        sentiment = price_change * 3

                    sentiment = min(max(sentiment + np.random.normal(0, 0.2), -1), 1)

                    # More news on days with bigger price moves
                    news_count = int(2 + abs(price_change) * 20 + np.random.normal(0, 1))
                    news_count = max(1, news_count)  # At least 1 news item

                    synthetic_news_data.append({
                        'date': date,
                        'news_sentiment_mean': sentiment,
                        'news_sentiment_std': 0.2,
                        'news_count': news_count
                    })

            self.news = pd.DataFrame(synthetic_news_data)
            self.news.set_index('date', inplace=True)
            print(f"Created synthetic news data for {ticker}")
            return True

        return False

    def analyze_sentiment(self):
        """Analyze 7sentiment from tweets and news, aggregate by day"""
        print("Analyzing sentiment data...")

        # Create date ranges for the period we're analyzing
        end_date = datetime.now().date()
        start_date = end_date - timedelta(days=self.lookback_days)
        date_range = pd.date_range(start=start_date, end=end_date)

        # Initialize sentiment DataFrames
        sentiment_daily = pd.DataFrame(index=date_range)

        # Process tweets sentiment
        if self.tweets is not None and not self.tweets.empty:
            # The 'date' column is already present in self.tweets DataFrame
            # and is used as the index.
            # self.tweets['date'] = self.tweets['created_at'].dt.date

            # Group by date and calculate metrics
            tweet_sentiment = self.tweets.groupby(self.tweets.index).agg({
                'tweet_sentiment_mean': 'mean',
                'tweet_sentiment_std': 'std',
                'tweet_count': 'sum',
                'retweet_count': 'sum',
                'favorite_count': 'sum'
            })

            # tweet_sentiment.columns = ['tweet_sentiment_mean', 'tweet_sentiment_std',
            #                          'tweet_count', 'retweet_count', 'favorite_count']

            # Convert index to datetime for proper joining
            # tweet_sentiment.index = pd.to_datetime(tweet_sentiment.index)

            # Join with main sentiment DataFrame
            sentiment_daily = sentiment_daily.join(tweet_sentiment)

        # Process news sentiment
        if self.news is not None and not self.news.empty:
            # Convert to datetime and extract date
            # The news dataframe already has a 'date' column,
            # just converting to datetime
            self.news = self.news.reset_index()
            self.news['date'] = pd.to_datetime(self.news['date']).dt.date
            #self.news['date'] = self.news['published_date'].dt.date

            # Group by date and calculate metrics
            news_sentiment = self.news.groupby('date').agg({
                'news_sentiment_mean': ['mean', 'std'], #changed 'sentiment' to actual columns
                'news_count': 'sum'                      #changed 'sentiment' to actual columns
            })

            news_sentiment.columns = ['news_sentiment_mean', 'news_sentiment_std', 'news_count']

            # Convert index to datetime for proper joining
            news_sentiment.index = pd.to_datetime(news_sentiment.index)

            # Join with main sentiment DataFrame
            sentiment_daily = sentiment_daily.join(news_sentiment)

        # Fill NaN values with 0 for calculation purposes
        sentiment_daily = sentiment_daily.fillna(0)

        # Calculate additional metrics
        if 'tweet_sentiment_mean' in sentiment_daily.columns:
            sentiment_daily['tweet_sentiment_zscore'] = (
                sentiment_daily['tweet_sentiment_mean'] -
                sentiment_daily['tweet_sentiment_mean'].rolling(window=5).mean()
            ) / sentiment_daily['tweet_sentiment_mean'].rolling(window=5).std().replace(0, 1)

        if 'news_sentiment_mean' in sentiment_daily.columns:
            sentiment_daily['news_sentiment_zscore'] = (
                sentiment_daily['news_sentiment_mean'] -
                sentiment_daily['news_sentiment_mean'].rolling(window=5).mean()
            ) / sentiment_daily['news_sentiment_mean'].rolling(window=5).std().replace(0, 1)

        # Add sentiment momentum (change from previous day)
        for col in ['tweet_sentiment_mean', 'news_sentiment_mean']:
            if col in sentiment_daily.columns:
                sentiment_daily[f'{col}_change'] = sentiment_daily[col].diff()

        # Add volume change metrics
        for col in ['tweet_count', 'news_count']:
            if col in sentiment_daily.columns:
                sentiment_daily[f'{col}_change'] = sentiment_daily[col].pct_change()
                sentiment_daily[f'{col}_zscore'] = (
                    sentiment_daily[col] -
                    sentiment_daily[col].rolling(window=5).mean()
                ) / sentiment_daily[col].rolling(window=5).std().replace(0, 1)

        return sentiment_daily

    def integrate_data(self):
        """Integrate stock data with sentiment analysis"""
        print("Integrating market and sentiment data...")

        sentiment_daily = self.analyze_sentiment()

        self.stock_data.index = pd.to_datetime(self.stock_data.index)

        merged_data = self.stock_data.join(sentiment_daily, how='left')

        merged_data = merged_data.fillna(0)

        if 'tweet_sentiment_mean' in merged_data.columns:
            merged_data['tweet_price_corr'] = merged_data['tweet_sentiment_mean'].rolling(window=5).corr(merged_data['price_change'])
            merged_data['tweet_volume_corr'] = merged_data['tweet_sentiment_mean'].rolling(window=5).corr(merged_data['volume_change'])

        if 'news_sentiment_mean' in merged_data.columns:
            merged_data['news_price_corr'] = merged_data['news_sentiment_mean'].rolling(window=5).corr(merged_data['price_change'])
            merged_data['news_volume_corr'] = merged_data['news_sentiment_mean'].rolling(window=5).corr(merged_data['volume_change'])

        if 'tweet_count' in merged_data.columns and 'news_count' in merged_data.columns:
            merged_data['pump_indicator'] = (
                (merged_data['price_z_score'] > 1.0) &
                (merged_data['tweet_sentiment_zscore'] > 1.0) &
                (merged_data['news_count'] < merged_data['news_count'].mean())
            ).astype(int)

        if 'tweet_sentiment_zscore' in merged_data.columns:
            merged_data['dump_indicator'] = (
                (merged_data['volume_z_score'] > 1.0) &
                (merged_data['price_change'] < 0) &
                (merged_data['tweet_sentiment_zscore'] < -1.0)
            ).astype(int)

        merged_data = merged_data.iloc[-self.lookback_days:]

        return merged_data

    def train_anomaly_model(self, data):
        """Train isolation forest model for anomaly detection"""
        print("Training anomaly detection model...")

        feature_cols = [
            'price_z_score', 'volume_z_score', 'volatility',
            'high_low_diff', 'price_momentum', 'volume_momentum'
        ]

        sentiment_features = [
            'tweet_sentiment_zscore', 'news_sentiment_zscore',
            'tweet_count_zscore', 'news_count_zscore',
            'tweet_price_corr', 'news_price_corr'
        ]

        for feature in sentiment_features:
            if feature in data.columns:
                feature_cols.append(feature)

        valid_features = [col for col in feature_cols if col in data.columns]

        if not valid_features:
            print("No valid features found for anomaly detection")
            return False

        X = data[valid_features].fillna(0)

        X_scaled = self.scaler.fit_transform(X)

        self.anomaly_model = IsolationForest(
            n_estimators=100,
            contamination=0.05,
            random_state=42
        )

        self.anomaly_model.fit(X_scaled)

        data['anomaly_score'] = self.anomaly_model.decision_function(X_scaled)
        data['is_anomaly'] = self.anomaly_model.predict(X_scaled)

        data['is_anomaly'] = (data['is_anomaly'] == -1).astype(int)

        return data

    def train_manipulation_model(self, data):
        """Train XGBoost model to classify potential manipulation"""

        print("Training manipulation detection model...")
        if 'pump_indicator' in data.columns and 'dump_indicator' in data.columns:
            data['potential_manipulation'] = (
                (data['is_anomaly'] == 1) &
                ((data['pump_indicator'] == 1) | (data['dump_indicator'] == 1))
            ).astype(int)
        else:
            data['potential_manipulation'] = data['is_anomaly']

        feature_cols = [
            'price_z_score', 'volume_z_score', 'volatility',
            'high_low_diff', 'price_momentum', 'volume_momentum'
        ]

        sentiment_features = [
            'tweet_sentiment_mean', 'news_sentiment_mean',
            'tweet_count', 'news_count',
            'tweet_sentiment_zscore', 'news_sentiment_zscore',
            'tweet_price_corr', 'news_price_corr'
        ]

        for feature in sentiment_features:
            if feature in data.columns:
                feature_cols.append(feature)

        valid_features = [col for col in feature_cols if col in data.columns]

        if not valid_features:
            print("No valid features found for manipulation model")
            return data

        X = data[valid_features].fillna(0)
        y = data['potential_manipulation']

        X_scaled = self.scaler.fit_transform(X)

        self.manipulation_model = xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        )

        try:
           if len(y.unique()) > 1:
               from sklearn.model_selection import train_test_split
               X_train, X_test, y_train, y_test = train_test_split(
                   X_scaled, y, test_size=0.2, random_state=42
               )
               self.manipulation_model.fit(X_train, y_train)
               y_pred = self.manipulation_model.predict(X_test)
               y_pred_proba = self.manipulation_model.predict_proba(X_test)[:, 1]
               precision = precision_score(y_test, y_pred)
               recall = recall_score(y_test, y_pred)
               f1 = f1_score(y_test, y_pred)
               print("\n----- Model Performance Metrics -----")
               print(f"Precision: {precision:.3f}")
               print(f"Recall: {recall:.3f}")
               print(f"F1 Score: {f1:.3f}")

               data['manipulation_probability'] = self.manipulation_model.predict_proba(X_scaled)[:, 1]
               data['predicted_manipulation'] = self.manipulation_model.predict(X_scaled)

               feature_importance = pd.DataFrame({
                   'feature': valid_features,
                   'importance': self.manipulation_model.feature_importances_
               }).sort_values('importance', ascending=False)
               print("\nTop manipulation indicators:")
               print(feature_importance.head(5))

               from sklearn.metrics import roc_curve, auc
               fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
               roc_auc = auc(fpr, tpr)
               plt.figure(figsize=(8, 6))
               plt.plot(fpr, tpr, color='darkorange', lw=2,
                       label=f'ROC curve (AUC = {roc_auc:.2f})')
               plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
               plt.xlim([0.0, 1.0])
               plt.ylim([0.0, 1.05])
               plt.xlabel('False Positive Rate')
               plt.ylabel('True Positive Rate')
               plt.title('Receiver Operating Characteristic (ROC) Curve')
               plt.legend(loc="lower right")
               plt.show()
           else:
               print("Not enough variation in the target variable to train classifier")
               data['manipulation_probability'] = 0
               data['predicted_manipulation'] = 0
        except Exception as e:
              print(f"Error training manipulation model: {e}")
              data['manipulation_probability'] = 0
              data['predicted_manipulation'] = 0

        return data


    def detect_manipulation(self, demo_mode=True):
        """Main method to run the entire detection pipeline"""
        # Fetch data
        if not self.fetch_stock_data():
            print("Failed to fetch stock data. Aborting.")
            return None

        if demo_mode:
            self.fetch_tweets_alternative(self.ticker)
            self.fetch_news_alternative(self.ticker)
        else:
            self.fetch_tweets()
            self.fetch_news()

        # Add these steps to complete the pipeline
        # 1. Integrate all data sources
        merged_data = self.integrate_data()

        # 2. Train anomaly detection model
        merged_data = self.train_anomaly_model(merged_data)

        # 3. Train manipulation detection model
        merged_data = self.train_manipulation_model(merged_data)

        # 4. Display results
        self.display_results(merged_data)

        return merged_data

    def display_results(self, data):
        """Display detection results and visualizations"""
        if data is None or data.empty:
            print("No data available to display results")
            return

        # Print summary of detected manipulations
        print("\n----- MANIPULATION DETECTION SUMMARY -----")

        # Filter to just the most recent period
        recent_data = data.iloc[-self.lookback_days:]

        # Count days with potential manipulation
        if 'predicted_manipulation' in recent_data.columns:
            manipulation_days = recent_data[recent_data['predicted_manipulation'] == 1]
            n_manipulation_days = len(manipulation_days)

            print(f"Detected potential manipulation on {n_manipulation_days} days out of {len(recent_data)} analyzed.")

            if n_manipulation_days > 0:
                print("\nDates with suspected manipulation:")
                for date, row in manipulation_days.iterrows():
                    features = []

                    # Add indicators that triggered the alert
                    if row['price_z_score'] > 1.5:
                        features.append(f"Abnormal price (z={row['price_z_score']:.2f})")
                    if row['volume_z_score'] > 1.5:
                        features.append(f"Abnormal volume (z={row['volume_z_score']:.2f})")
                    if 'tweet_sentiment_zscore' in row and row['tweet_sentiment_zscore'] > 1.5:
                        features.append(f"Abnormal social sentiment (z={row['tweet_sentiment_zscore']:.2f})")
                    if 'pump_indicator' in row and row['pump_indicator'] == 1:
                        features.append("Pump pattern")
                    if 'dump_indicator' in row and row['dump_indicator'] == 1:
                        features.append("Dump pattern")

                    print(f"  {date.date()}: {', '.join(features)}")
        else:
            print("Manipulation classification not available.")

        # Plot results
            try:
                self.plot_results(data)
            except Exception as e:
                print(f"Error creating plots: {e}")

    def plot_results(self, data):
        """Create visualizations of the detection results"""
        fig, axes = plt.subplots(3, 1, figsize=(14, 18), sharex=True)

        # Stock price with anomaly highlighting
        ax0 = axes[0]
        ax0.set_title(f"{self.ticker} Stock Price with Anomaly Detection", fontsize=14)
        ax0.plot(data.index, data['close'], label='Close Price', color='blue')

        # Highlight anomalies if available
        if 'is_anomaly' in data.columns:
            anomaly_days = data[data['is_anomaly'] == 1]
            ax0.scatter(anomaly_days.index, anomaly_days['close'],
                      color='red', label='Anomalies', zorder=5)

        # Highlight manipulation if available
        if 'predicted_manipulation' in data.columns:
            manip_days = data[data['predicted_manipulation'] == 1]
            ax0.scatter(manip_days.index, manip_days['close'],
                      color='darkred', marker='X', s=100,
                      label='Potential Manipulation', zorder=10)

        ax0.set_ylabel('Price ($)')
        ax0.legend()
        ax0.grid(True, alpha=0.3)

        # Volume plot
        ax1 = axes[1]
        ax1.set_title(f"{self.ticker} Trading Volume", fontsize=14)
        ax1.bar(data.index, data['volume'], color='green', alpha=0.7, label='Volume')

        # Highlight volume anomalies
        volume_anomalies = data[data['volume_z_score'] > 1.5]
        ax1.bar(volume_anomalies.index, volume_anomalies['volume'], color='orange', label='Volume Anomalies')

        ax1.set_ylabel('Volume')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Sentiment plot
        ax2 = axes[2]
        ax2.set_title("Sentiment Analysis", fontsize=14)

        if 'tweet_sentiment_mean' in data.columns:
            ax2.plot(data.index, data['tweet_sentiment_mean'],
                   label='Social Sentiment', color='purple')

        if 'news_sentiment_mean' in data.columns:
            ax2.plot(data.index, data['news_sentiment_mean'],
                   label='News Sentiment', color='brown')

        # Add zero line
        ax2.axhline(y=0, color='gray', linestyle='-', alpha=0.5)

        ax2.set_ylabel('Sentiment Score')
        ax2.set_xlabel('Date')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(f"{self.ticker}_manipulation_analysis.png")
        plt.close()

        print(f"\nSaved visualization to {self.ticker}_manipulation_analysis.png")

def test_detector(ticker_symbol, demo_mode=True):
    print(f"\n===== ANALYZING {ticker_symbol} =====")
    detector = StockManipulationDetector(ticker_symbol, lookback_days=365)
    results = detector.detect_manipulation(demo_mode=True)
    return results

# Main function
def main():

    tickers = ["NVDA", "AMD", "TSLA", "COIN","GME"]
# - TSLA (Tesla)
# - NVDA (NVIDIA)
# - AMD (Advanced Micro Devices)
# - COIN (Coinbase)
# - GME (GameStop)
    for ticker in tickers:
        test_detector(ticker)
        time.sleep(2)

if __name__ == "__main__":
    main()

AttributeError: partially initialized module 'nltk' has no attribute 'data' (most likely due to a circular import)