<a href="https://colab.research.google.com/github/t-aridi/DS-4002/blob/main/SCRIPTS/4_time_series_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install statsmodels
!pip install pmdarima
!pip install prophet
!apt-get install git

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [2]:

!git clone https://github.com/ArjunaBazaz/Presidential_Approval_Rating_Prediction.git

import os
os.chdir('Presidential_Approval_Rating_Prediction/DATA')


import glob

csv_files = glob.glob('*.csv')
print(csv_files)


Cloning into 'Presidential_Approval_Rating_Prediction'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (139/139), done.[K
remote: Compressing objects: 100% (125/125), done.[K
remote: Total 139 (delta 45), reused 58 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (139/139), 2.38 MiB | 11.38 MiB/s, done.
Resolving deltas: 100% (45/45), done.
['approval_rating_obama_1_2.csv', 'approval_rating_reagan_1_2_updated.csv', 'approval_rating_clinton_1_2_updated.csv', 'approval_rating_carter_1.csv', 'approval_rating_johnson_1_2_updated.csv', 'approval_rating_bushjr_1_2_updated.csv', 'approval_rating_carter_1_updated.csv', 'approval_rating_truman_1_updated.csv', 'real_GDP_per_capita_daily_change.csv', 'approval_rating_roosevelt_3_4.csv', 'approval_rating_kennedy_1.csv', 'approval_rating_eisenhower_1_2_updated.csv', 'approval_rating_truman_1.csv', 'approval_rating_bushjr_1_2.csv', 'approval_rating_obama_1_2_updated.csv', 'approval_rating_bushsr_1.csv', 'a

Now we have imported the data we need from Github. Let's properly load our data and define a start and end date.

In [4]:
import pandas as pd

# -------------------------
# 1. Load Datasets and Parse Dates
# -------------------------

# Load the approval ratings dataset and convert dates (handle mixed formats)
approval_df = pd.read_csv('approval_rating.csv', parse_dates=['Start Date', 'End Date'])
approval_df['Start Date'] = pd.to_datetime(approval_df['Start Date'], errors='coerce')
approval_df['End Date'] = pd.to_datetime(approval_df['End Date'], errors='coerce')

# Load other datasets with their date columns
income_df = pd.read_csv('median_household_income_daily_change.csv', parse_dates=['observation_date'])
gdp_df = pd.read_csv('real_GDP_per_capita_daily_change.csv', parse_dates=['observation_date'])
sp500_df = pd.read_csv(
    'sp500_daily_change.csv',
    parse_dates=['Date'],
    date_parser=lambda x: pd.to_datetime(x, utc=True)
)
# Convert S&P 500 timestamps to tz-naive
sp500_df['Date'] = sp500_df['Date'].dt.tz_convert(None)
unemployment_df = pd.read_csv('unemployment_rate.csv', parse_dates=['observation_date'])

# -------------------------
# 2. Define the Common Date Range
# -------------------------
common_start = pd.Timestamp('1985-01-01')
common_end = pd.Timestamp('2023-01-01')

# It is best to restrict approval data to polls fully within the common period.
approval_df = approval_df[(approval_df['Start Date'] >= common_start) & (approval_df['End Date'] <= common_end)]

# -------------------------
# 3. Compute Weekly Weighted Approval Rating
# -------------------------
# Instead of expanding by week (which can misrepresent short polls), we compute,
# for each weekly bucket, the weighted average approval where weight equals the proportion
# of days in the poll period that fall within the week.

# Create a weekly index; here we choose weeks starting on Monday.
weekly_index = pd.date_range(start=common_start, end=common_end, freq='W-MON')
weekly_ratings = []  # to store computed weekly approval ratings

def get_overlap_days(poll_start, poll_end, week_start, week_end):
    # Compute the number of days overlap between poll period and the current week
    latest_start = max(poll_start, week_start)
    earliest_end = min(poll_end, week_end)
    delta = (earliest_end - latest_start).days + 1  # include both endpoints
    return max(0, delta)

# Iterate over each week and compute the weighted average approval rating
for week_start in weekly_index:
    week_end = week_start + pd.Timedelta(days=6)
    numerator = 0.0
    denominator = 0.0
    # For every poll, compute overlap with the current week
    for _, row in approval_df.iterrows():
        poll_start = row['Start Date']
        poll_end = row['End Date']
        # Skip polls that do not overlap with the week
        if poll_end < week_start or poll_start > week_end:
            continue
        overlap = get_overlap_days(poll_start, poll_end, week_start, week_end)
        if overlap <= 0:
            continue
        # Total duration of the poll in days (include both start and end)
        duration = (poll_end - poll_start).days + 1
        weight = overlap / duration
        numerator += weight * row['Approving']
        denominator += weight
    # If any poll contributed, calculate weighted average; else, leave as missing (None)
    week_rating = numerator / denominator if denominator > 0 else None
    weekly_ratings.append(week_rating)

# Create a Series for weekly approval ratings
approval_weekly = pd.Series(weekly_ratings, index=weekly_index, name='Approval_Rating')

# -------------------------
# 4. Resample and Process the Economic Datasets
# -------------------------
# For consistency, we resample these to weekly frequency, aligning on our W-MON index.

# S&P 500: Daily data → weekly average
sp500_weekly = sp500_df.set_index('Date')['Close_Change'].resample('W-MON').mean()

# GDP: Quarterly data → weekly via forward fill
gdp_weekly = gdp_df.set_index('observation_date')['Change'].resample('W-MON').ffill()

# Income: Annual data → weekly by linear interpolation
income_weekly = income_df.set_index('observation_date')['Change'].resample('W-MON').interpolate(method='linear')

# Unemployment: Monthly data → weekly via forward fill
unemployment_weekly = unemployment_df.set_index('observation_date')['UNRATE'].resample('W-MON').ffill()

# -------------------------
# 5. Merge the Datasets into a Single DataFrame
# -------------------------
merged_df = approval_weekly.to_frame().merge(
    sp500_weekly, left_index=True, right_index=True, how='left'
).merge(
    gdp_weekly, left_index=True, right_index=True, how='left'
).merge(
    income_weekly, left_index=True, right_index=True, how='left'
).merge(
    unemployment_weekly, left_index=True, right_index=True, how='left'
)

merged_df.columns = ['Approval_Rating', 'SP500_Change', 'GDP_Change', 'Income_Change', 'Unemployment_Rate']

# -------------------------
# 6. Handle Any Remaining Missing Data
# -------------------------
# Print missing value counts before filling:
print("Missing Values Before Handling:")
print(merged_df.isnull().sum())

# For polls, if a week has no poll data, you may want to interpolate using time-aware interpolation.
merged_df['Approval_Rating'] = merged_df['Approval_Rating'].interpolate(method='time', limit=4)
# For any remaining gaps, fill forward then backward for approval ratings.
merged_df['Approval_Rating'] = merged_df['Approval_Rating'].fillna(method='ffill').fillna(method='bfill')

# For economic indicators, fill missing values similarly.
merged_df[['SP500_Change', 'GDP_Change', 'Income_Change', 'Unemployment_Rate']] = merged_df[
    ['SP500_Change', 'GDP_Change', 'Income_Change', 'Unemployment_Rate']
].ffill().bfill()

print("\nMissing Values After Handling:")
print(merged_df.isnull().sum())

# merged_df now contains a complete, weekly time series ready for time series analysis.


  sp500_df = pd.read_csv(


Missing Values Before Handling:
Approval_Rating      598
SP500_Change           0
GDP_Change             0
Income_Change        260
Unemployment_Rate      0
dtype: int64

Missing Values After Handling:
Approval_Rating      0
SP500_Change         0
GDP_Change           0
Income_Change        0
Unemployment_Rate    0
dtype: int64


  merged_df['Approval_Rating'] = merged_df['Approval_Rating'].fillna(method='ffill').fillna(method='bfill')
