## 📊 Examine Downloaded WRDS Data

Let's check the quality and structure of the downloaded CRSP data.

In [5]:
# Load and examine the downloaded CRSP data
import pandas as pd
import numpy as np
from pathlib import Path
import config

print("üîç EXAMINING DOWNLOADED CRSP DATA")
print("=" * 50)

# Check if data file exists
data_file = Path(config.DATA_DIR) / "stock_data_raw.csv"
print(f"üìÅ Looking for data file: {data_file}")

if data_file.exists():
    print("‚úÖ Data file found! Loading...")
    
    # Load the data
    downloaded_data = pd.read_csv(data_file)
    downloaded_data['date'] = pd.to_datetime(downloaded_data['date'])
    
    print(f"\nüìä DATASET OVERVIEW:")
    print(f"   Total observations: {len(downloaded_data):,}")
    print(f"   Unique stocks (PERMNO): {downloaded_data['permno'].nunique():,}")
    print(f"   Date range: {downloaded_data['date'].min().date()} to {downloaded_data['date'].max().date()}")
    print(f"   Columns: {list(downloaded_data.columns)}")
    
    # Check data structure
    print(f"\nüèóÔ∏è DATA STRUCTURE:")
    print(f"   Data shape: {downloaded_data.shape}")
    print(f"   Memory usage: {downloaded_data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    
    # Display first few rows
    print(f"\nüìã FIRST 5 ROWS:")
    print(downloaded_data.head())
    
else:
    print("‚ùå Data file not found!")
    print("üí° Run the download script first: python download_wrds_data.py")

üîç EXAMINING DOWNLOADED CRSP DATA
üìÅ Looking for data file: data\stock_data_raw.csv
‚úÖ Data file found! Loading...

üìä DATASET OVERVIEW:
   Total observations: 938,008
   Unique stocks (PERMNO): 10,848
   Date range: 1965-01-29 to 1989-12-29
   Columns: ['date', 'permno', 'ticker', 'ret', 'prc', 'shrout', 'exchcd', 'shrcd', 'vol', 'market_cap']

üèóÔ∏è DATA STRUCTURE:
   Data shape: (938008, 10)
   Memory usage: 118.2 MB

üìã FIRST 5 ROWS:
        date  permno ticker       ret     prc  shrout  exchcd  shrcd     vol  \
0 1986-02-28   10001   GFGC  0.020408  6.2500   985.0       3     11  1067.0   
1 1986-03-31   10001   GFGC  0.025200  6.3125   985.0       3     11   335.0   
2 1986-04-30   10001   GFGC  0.009901  6.3750   985.0       3     11   225.0   
3 1986-05-30   10001   GFGC -0.009804  6.3125   985.0       3     11   217.0   
4 1986-06-30   10001   GFGC -0.013069  6.1250   985.0       3     11   238.0   

   market_cap  
0    6.156250  
1    6.217812  
2    6.279375  
3 

In [6]:
# Data Quality Analysis
#
# Prints a plain-text report on `downloaded_data` (the DataFrame loaded in the
# previous cell): missing values per column, summary statistics, exchange mix,
# per-month observation counts, and a final summary whose "complete" and
# "ready" lines are only printed when the checks below actually pass.
print("üî¨ DATA QUALITY ANALYSIS")
print("=" * 40)

# Verify up front that every column this report reads is present, so a
# malformed file produces one clear error instead of a KeyError part-way
# through the printed report.
quality_report_columns = ['date', 'permno', 'ret', 'prc', 'market_cap', 'exchcd']
absent_columns = [c for c in quality_report_columns if c not in downloaded_data.columns]
if absent_columns:
    raise KeyError(f"downloaded_data is missing columns needed for this report: {absent_columns}")

n_obs = len(downloaded_data)

# Check for missing values: list only the columns that have any.
print("\nüìâ Missing Values:")
missing_counts = downloaded_data.isnull().sum()
for col, count in missing_counts[missing_counts > 0].items():
    pct = (count / n_obs) * 100
    print(f"   {col}: {count:,} ({pct:.2f}%)")

if missing_counts.sum() == 0:
    print("   ‚úÖ No missing values found!")

# Check data types and ranges
print("\nüìä Data Statistics:")
print(f"   Returns - Mean: {downloaded_data['ret'].mean():.4f}, Std: {downloaded_data['ret'].std():.4f}")
print(f"   Returns - Min: {downloaded_data['ret'].min():.4f}, Max: {downloaded_data['ret'].max():.4f}")
print(f"   Prices - Min: ${downloaded_data['prc'].min():.2f}, Max: ${downloaded_data['prc'].max():.2f}")
print(f"   Market Cap - Min: ${downloaded_data['market_cap'].min():.2f}M, Max: ${downloaded_data['market_cap'].max():.2f}M")

# Check exchange distribution. Codes without a label fall back to
# "Exchange <code>".
print("\nüè¢ Exchange Distribution:")
exchange_labels = {1: 'NYSE', 2: 'AMEX', 3: 'NASDAQ'}
exchange_dist = downloaded_data['exchcd'].value_counts().sort_index()
for exchcd, count in exchange_dist.items():
    pct = (count / n_obs) * 100
    exchange_name = exchange_labels.get(exchcd, f'Exchange {exchcd}')
    print(f"   {exchange_name} ({exchcd}): {count:,} ({pct:.1f}%)")

# Check time coverage: number of rows in each calendar month.
print("\nüìÖ Time Coverage:")
monthly_obs = downloaded_data.groupby(downloaded_data['date'].dt.to_period('M')).size()
print(f"   Total months: {len(monthly_obs)}")
print(f"   Avg observations per month: {monthly_obs.mean():.0f}")
print(f"   Min observations per month: {monthly_obs.min()}")
print(f"   Max observations per month: {monthly_obs.max()}")

# Compare the observed months against every calendar month between the first
# and last observation, so "complete" below is a verified statement.
if len(monthly_obs) > 0:
    expected_months = pd.period_range(monthly_obs.index.min(), monthly_obs.index.max(), freq='M')
    missing_months = expected_months.difference(monthly_obs.index)
else:
    missing_months = monthly_obs.index[:0]
coverage_complete = len(monthly_obs) > 0 and len(missing_months) == 0

print("\nüéâ DATA QUALITY SUMMARY:")
print(f"   ‚úÖ {n_obs:,} total observations")
if coverage_complete:
    print(f"   ‚úÖ Complete {downloaded_data['date'].min().year}-{downloaded_data['date'].max().year} period")
elif len(monthly_obs) == 0:
    print("   WARNING: no dated observations found")
else:
    preview = ', '.join(str(m) for m in missing_months[:5])
    suffix = ', ...' if len(missing_months) > 5 else ''
    print(f"   WARNING: {len(missing_months)} month(s) with no observations: {preview}{suffix}")
print(f"   ‚úÖ {downloaded_data['permno'].nunique():,} unique stocks")
# Reaching this point means the column check at the top of the cell passed.
print("   ‚úÖ All required columns present")
if coverage_complete:
    print("   ‚úÖ Data ready for momentum analysis!")
else:
    print("   WARNING: review the time-coverage gaps above before running the momentum analysis")

üî¨ DATA QUALITY ANALYSIS

üìâ Missing Values:
   ticker: 6,012 (0.64%)
   vol: 175,577 (18.72%)

üìä Data Statistics:
   Returns - Mean: 0.0187, Std: 0.1274
   Returns - Min: -0.8102, Max: 5.5625
   Prices - Min: $5.00, Max: $8675.00
   Market Cap - Min: $0.00M, Max: $102022.29M

üè¢ Exchange Distribution:
   NYSE (1): 386,700 (41.2%)
   AMEX (2): 187,485 (20.0%)
   NASDAQ (3): 363,823 (38.8%)

üìÖ Time Coverage:
   Total months: 300
   Avg observations per month: 3127
   Min observations per month: 1791
   Max observations per month: 4494

üéâ DATA QUALITY SUMMARY:
   ‚úÖ 938,008 total observations
   ‚úÖ Complete 1965-1989 period
   ‚úÖ 10,848 unique stocks
   ‚úÖ All required columns present
   ‚úÖ Data ready for momentum analysis!
