# Time Series Analysis

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# Read the raw analyst-ratings CSV, asking read_csv to parse the 'date' column up front
df = pd.read_csv('C:/raw_analyst_ratings/raw_analyst_ratings.csv', parse_dates=['date'])

# Normalise 'date' by walking through progressively looser parsing strategies.
# Only a ValueError advances to the next strategy; the first success wins.
strict_attempts = ({'format': 'ISO8601'}, {'format': 'mixed'})
for parse_options in strict_attempts:
    try:
        df['date'] = pd.to_datetime(df['date'], **parse_options)
    except ValueError:
        continue
    break
else:
    # Last resort: let pandas infer the format, reading ambiguous dates day-first.
    # A failure here is not caught and surfaces to the reader.
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)

#### Publication Frequency Analysis

In [3]:
# Daily publication counts: index the articles by timestamp, bucket them into
# calendar days, and count the rows that land in each bucket.
articles_by_timestamp = df.set_index('date')
daily_buckets = articles_by_timestamp.resample('D')
time_series = daily_buckets.size()

#### Identify Top Spikes

In [4]:
# A day is a "spike" when its count exceeds the mean daily count by more than
# two standard deviations.
daily_mean = time_series.mean()
daily_std = time_series.std()
spike_threshold = daily_mean + 2 * daily_std

is_spike = time_series.gt(spike_threshold)
spikes = time_series.loc[is_spike]

#### Hourly Publication Pattern

In [5]:
# Count articles per hour-of-day and order the result by hour.
# NOTE(review): the recorded output below attributes ~96% of rows to hour 0,
# which suggests many timestamps carry no time-of-day component - confirm
# before interpreting the peak hour.
publication_hours = df['date'].dt.hour
hourly_counts = publication_hours.value_counts()
hourly_pattern = hourly_counts.sort_index()

#### Analysis Tables

In [6]:
# One-row summary of the whole corpus: covered period, volume, daily and
# hourly peaks, and the number of days flagged as spikes.
first_day = df['date'].min().date()
last_day = df['date'].max().date()
busiest_hour = hourly_pattern.idxmax()
busiest_hour_count = hourly_pattern.max()

publication_stats = pd.DataFrame({
    'Time Period': [f"{first_day} to {last_day}"],
    'Total Articles': [len(df)],
    'Average Daily Publications': [round(time_series.mean(), 1)],
    'Peak Daily Publications': [time_series.max()],
    'Most Active Hour': [busiest_hour],
    'Articles During Peak Hour': [busiest_hour_count],
    'Significant Spikes (>2σ)': [len(spikes)]
})

# Per-spike table: calendar date, article count, and how far the count sits
# from the mean daily count, expressed in standard deviations.
# The mean and standard deviation are computed once here rather than once per
# spike inside the comprehension.
daily_mean = time_series.mean()
daily_std = time_series.std()

spike_dates = pd.DataFrame({
    'Date': spikes.index.date,
    'Article Count': spikes.values,
    # The "+" format flag emits the sign itself, so a below-mean value would
    # render as "-1.2σ" instead of "+-1.2σ".
    'Deviation from Mean': [
        f"{(count - daily_mean) / daily_std:+.1f}σ" for count in spikes.values
    ]
})

#### Display Tables

In [7]:
# Render the corpus-level summary under a plain-text heading; the Styler hides
# the row index so only the named columns are shown.
print("PUBLICATION FREQUENCY STATISTICS", "--------------------------------", sep="\n")
stats_view = publication_stats.style.hide(axis='index')
display(stats_view)

# Render only the first ten rows of the spike table, again without the index.
print("\nTOP PUBLICATION SPIKES", "----------------------", sep="\n")
top_spikes_view = spike_dates.head(10).style.hide(axis='index')
display(top_spikes_view)

PUBLICATION FREQUENCY STATISTICS
--------------------------------


Time Period,Total Articles,Average Daily Publications,Peak Daily Publications,Most Active Hour,Articles During Peak Hour,Significant Spikes (>2σ)
2009-02-14 to 2020-06-11,1407328,340.3,2739,0,1351408,93



TOP PUBLICATION SPIKES
----------------------


Date,Article Count,Deviation from Mean
2009-08-10,1130,+2.8σ
2011-05-23,930,+2.1σ
2011-07-28,1042,+2.5σ
2016-04-28,911,+2.0σ
2016-08-04,943,+2.1σ
2016-10-27,931,+2.1σ
2017-11-02,1036,+2.4σ
2018-07-25,1017,+2.4σ
2018-07-26,1146,+2.8σ
2018-07-31,922,+2.0σ


#### Hourly Analysis Table

In [8]:
# Tabulate the hourly counts with each hour's share of all rows in df,
# formatted to one decimal place.
total_articles = len(df)
percentage_labels = [
    f"{(count/total_articles*100):.1f}%" for count in hourly_pattern.values
]

hourly_table = pd.DataFrame({
    'Hour (UTC-4)': hourly_pattern.index,
    'Publication Count': hourly_pattern.values,
    'Percentage': percentage_labels
})

# Render the hourly table under a plain-text heading, hiding the row index.
print("\nHOURLY PUBLICATION PATTERN", "-------------------------", sep="\n")
hourly_view = hourly_table.style.hide(axis='index')
display(hourly_view)


HOURLY PUBLICATION PATTERN
-------------------------


Hour (UTC-4),Publication Count,Percentage
0,1351408,96.0%
1,14,0.0%
2,57,0.0%
3,93,0.0%
4,1469,0.1%
5,1829,0.1%
6,2476,0.2%
7,5033,0.4%
8,5527,0.4%
9,5965,0.4%
