# Stock Market Data Exploration

This notebook explores the raw data gathered by the project's data collection and cleaning pipeline. It covers:
1. Historical stock price data from Yahoo Finance
2. News data from Alpha Vantage
3. Technical indicators and sentiment analysis
4. Basic statistical analysis and visualization

In [3]:
# Notebook setup: imports, NLTK resources, project paths, and the data collector.
# All imports needed by later cells live here so the notebook survives
# Restart Kernel -> Run All.
import os
import ssl
import sys
from pathlib import Path

import matplotlib.pyplot as plt  # `plt` is used by the plotting cells in sections 3 and 4
import nltk

# Fetch the NLTK stopwords corpus only when it is not already installed locally.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # SSL certificate workaround: some local Python installs cannot verify the
    # NLTK download host. Certificate verification is relaxed ONLY for this one
    # download and restored afterwards, instead of being disabled for every
    # HTTPS request the process makes.
    _unverified_https_context = getattr(ssl, '_create_unverified_context', None)
    if _unverified_https_context is None:
        # Interpreter has no unverified-context helper; download normally.
        nltk.download('stopwords')
    else:
        _default_https_context = ssl._create_default_https_context
        ssl._create_default_https_context = _unverified_https_context
        try:
            nltk.download('stopwords')
        finally:
            ssl._create_default_https_context = _default_https_context

# Print current working directory
print("Current working directory:", os.getcwd())

# Create absolute path to config file
project_root = Path.cwd().parent  # Adjust this if needed to point to your project root
config_path = project_root / "config" / "config.yaml"

print("Looking for config file at:", config_path)
print("Config file exists:", config_path.exists())

# Make the project's `src` package importable. The membership check keeps
# re-running this cell from appending the same entry again.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.data.data_collection import DataCollector
# NOTE(review): clean_stock_data, clean_news_data and combine_data are called in
# section 2 but were never imported anywhere in this notebook. They are assumed
# to live next to load_config in src.data.data_cleaning -- confirm.
from src.data.data_cleaning import (
    clean_news_data,
    clean_stock_data,
    combine_data,
    load_config,
)

# Initialize data collector with explicit config path
collector = DataCollector(config_path=str(config_path))

# Pass the config_path to load_config
config = load_config(config_path=str(config_path))

print(f"Analyzing data for symbols: {collector.symbols}")
print(f"Date range: {collector.start_date} to {collector.end_date}")

Current working directory: /Users/andresquast/Desktop/4641/stockMarketClone1125/ML_StockPredictor_Fall2024/notebooks
Looking for config file at: /Users/andresquast/Desktop/4641/stockMarketClone1125/ML_StockPredictor_Fall2024/config/config.yaml
Config file exists: True
Analyzing data for symbols: ['AAPL', 'GOOG', 'MSFT']
Date range: 2020-01-01 to 2024-01-01


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


## 1. Initialize Data Collection

In [None]:
# Initialize data collector and cleaning config.
# Both are built from the explicit `config_path` resolved in the setup cell, so
# this section reads the same config file no matter which directory the kernel
# was started from (the no-argument forms fall back to their own default path).
collector = DataCollector(config_path=str(config_path))
config = load_config(config_path=str(config_path))

print(f"Analyzing data for symbols: {collector.symbols}")
print(f"Date range: {collector.start_date} to {collector.end_date}")
print(f"Technical indicators enabled: {collector.technical_indicators}")
print(f"Sentiment analysis enabled: {collector.sentiment_enabled}")

## 2. Collect and Clean Data

In [None]:
# Collect all data
all_data = collector.collect_all_data()

# Process each symbol's data into one combined frame per symbol
processed_data = {}
for symbol in collector.symbols:
    print(f"\nProcessing {symbol} data:")

    symbol_data = all_data[symbol]

    # Clean stock data. assign() tags a copy with the ticker, so the raw frame
    # held in `all_data` is left untouched and this cell stays idempotent.
    stock_df = symbol_data['stock_data'].assign(Ticker=symbol)
    cleaned_stock = clean_stock_data(stock_df, config)
    print(f"Stock data shape: {cleaned_stock.shape}")

    # Clean news data if available. A missing (None) frame is treated the same
    # as an empty one instead of raising AttributeError on `.empty`.
    news_df = symbol_data['news_data']
    has_news = (
        collector.sentiment_enabled
        and news_df is not None
        and not news_df.empty
    )
    if has_news:
        cleaned_news = clean_news_data(news_df, config)
        print(f"News data shape: {cleaned_news.shape}")
    else:
        cleaned_news = None
        print("No news data available")

    # Combine data (cleaned_news is None when there is nothing to merge)
    processed_data[symbol] = combine_data(cleaned_stock, cleaned_news)

## 3. Analyze Technical Indicators

In [None]:
def analyze_technical_indicators(df, symbol, rsi_upper=70, rsi_lower=30):
    """Plot the closing price with moving averages, plus RSI and MACD panels.

    A panel is only created for an indicator whose columns are present in
    ``df``, so the figure never contains blank axes.

    Args:
        df: DataFrame with a 'Close' column. 'SMA5', 'SMA20', 'RSI', 'MACD'
            and 'MACD_Signal' are optional. The index is used as the x axis.
        symbol: Label used in the printed heading and the price panel title.
        rsi_upper: y position of the red dashed guide line on the RSI panel.
        rsi_lower: y position of the green dashed guide line on the RSI panel.

    Raises:
        KeyError: If ``df`` has no 'Close' column.
    """
    print(f"\nTechnical Analysis for {symbol}:")

    # Look up the mandatory column before creating a figure, so a bad frame
    # does not leave an orphaned figure behind.
    close = df['Close']

    has_rsi = 'RSI' in df.columns
    has_macd = 'MACD' in df.columns and 'MACD_Signal' in df.columns
    panel_count = 1 + int(has_rsi) + int(has_macd)

    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    # Four inches per panel reproduces the original 15x12 three-panel figure.
    fig, axes = plt.subplots(panel_count, 1, figsize=(15, 4 * panel_count), squeeze=False)
    remaining_axes = iter(axes[:, 0])

    # Price and Moving Averages
    price_ax = next(remaining_axes)
    price_ax.plot(df.index, close, label='Close')
    for sma_col in ('SMA5', 'SMA20'):
        if sma_col in df.columns:
            price_ax.plot(df.index, df[sma_col], label=sma_col)
    price_ax.set_title(f'{symbol} Price and Moving Averages')
    price_ax.set_ylabel('Price')
    price_ax.legend()

    # RSI
    if has_rsi:
        rsi_ax = next(remaining_axes)
        rsi_ax.plot(df.index, df['RSI'], color='purple')
        rsi_ax.axhline(y=rsi_upper, color='r', linestyle='--')
        rsi_ax.axhline(y=rsi_lower, color='g', linestyle='--')
        rsi_ax.set_title('RSI')
        rsi_ax.set_ylabel('RSI')

    # MACD
    if has_macd:
        macd_ax = next(remaining_axes)
        macd_ax.plot(df.index, df['MACD'], label='MACD')
        macd_ax.plot(df.index, df['MACD_Signal'], label='Signal')
        macd_ax.set_title('MACD')
        macd_ax.set_ylabel('MACD')
        macd_ax.legend()

    plt.tight_layout()
    plt.show()
    # Release the figure; this function is called once per symbol in a loop.
    plt.close(fig)

# Render the technical-indicator figure for every symbol in processed_data.
# `symbol` and `df` are notebook-global loop variables and keep the last
# symbol's values after the loop finishes.
for symbol, df in processed_data.items():
    analyze_technical_indicators(df, symbol)

## 4. Analyze Sentiment Data

In [None]:
def analyze_sentiment_data(df, symbol):
    """Plot sentiment over time and the news count, then print summary stats.

    Args:
        df: DataFrame expected to contain 'avg_sentiment', 'sentiment_std' and
            'news_count' columns. The index is used as the x axis.
        symbol: Label used in printed messages and the sentiment panel title.

    Returns:
        None. If any of the three sentiment columns is missing, a message is
        printed and nothing is plotted.
    """
    sentiment_cols = ['avg_sentiment', 'sentiment_std', 'news_count']

    if 'avg_sentiment' not in df.columns:
        print(f"No sentiment data available for {symbol}")
        return

    # The band and bar chart below index these columns directly; report the gap
    # instead of failing with a KeyError halfway through drawing.
    missing_cols = [col for col in sentiment_cols if col not in df.columns]
    if missing_cols:
        print(f"Incomplete sentiment data for {symbol}; missing columns: {missing_cols}")
        return

    print(f"\nSentiment Analysis for {symbol}:")

    # Plot sentiment metrics
    fig, (sentiment_ax, count_ax) = plt.subplots(2, 1, figsize=(15, 10))

    # Average sentiment over time, with a +/- one sentiment_std band around it
    avg = df['avg_sentiment']
    spread = df['sentiment_std']
    sentiment_ax.plot(df.index, avg, label='Average Sentiment')
    sentiment_ax.fill_between(df.index, avg - spread, avg + spread, alpha=0.2)
    sentiment_ax.set_title(f'{symbol} Sentiment Over Time')
    sentiment_ax.set_ylabel('Sentiment')
    sentiment_ax.legend()

    # News count
    count_ax.bar(df.index, df['news_count'])
    count_ax.set_title('Daily News Article Count')
    count_ax.set_ylabel('Articles')

    plt.tight_layout()
    plt.show()
    # Release the figure; this function is called once per symbol in a loop.
    plt.close(fig)

    # Print summary statistics
    print("\nSentiment Summary Statistics:")
    print(df[sentiment_cols].describe())

# Run the sentiment analysis for every symbol in processed_data.
# `symbol` and `df` are notebook-global loop variables and keep the last
# symbol's values after the loop finishes.
for symbol, df in processed_data.items():
    analyze_sentiment_data(df, symbol)

## 5. Save Processed Data

In [None]:
# Save processed data: one CSV per symbol under ../data/preprocessed
# (relative to the notebook's working directory).
for symbol, df in processed_data.items():
    output_path = f'../data/preprocessed/{symbol}_combined_preprocessed.csv'
    # DataFrame.to_csv does not create missing directories, so make sure the
    # target folder exists (a no-op when it is already there).
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path)
    print(f"Saved processed data for {symbol} to {output_path}")

    # Print final dataset info
    print(f"\nFinal dataset info for {symbol}:")
    print(f"Shape: {df.shape}")
    print("\nColumns:")
    print(df.columns.tolist())
    print("\nMissing values:")
    print(df.isnull().sum())