In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the raw analyst-ratings CSV and parse its "date" column in one chain.
# errors="coerce" turns any value that cannot be parsed into NaT instead of
# raising, so later cells must expect missing dates.
df = pd.read_csv("../data/raw_analyst_ratings.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"], errors="coerce")
)

# Preview the first rows of the loaded frame.
df.head()


In [None]:
# Dataset summary: column dtypes / non-null counts, followed by a sample.
print("INFO:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() appends a stray "None" line.
df.info()

print("\nSample rows:")
df.head()


In [None]:
# Descriptive statistics
# Headline length: number of characters in each headline string.
df["headline_length"] = df["headline"].str.len()
headline_lengths = df["headline_length"]
print(headline_lengths.describe())

# Histogram of the headline-length distribution.
length_ax = headline_lengths.plot(
    kind="hist",
    bins=40,
    figsize=(8, 5),
    title="Headline Length Distribution",
)
length_ax.set_xlabel("Length")
plt.show()
# Publisher analysis: article count per publisher (value_counts sorts the
# result from most to least frequent).
publisher_counts = df["publisher"].value_counts()
top_publishers = publisher_counts.head(10)

print("Top Publishers:")
print(top_publishers)

# Bar chart of the ten publishers with the most articles.
publisher_ax = top_publishers.plot(
    kind="bar",
    figsize=(10, 5),
    title="Top 10 Publishers",
)
publisher_ax.set_xlabel("Publisher")
publisher_ax.set_ylabel("Articles")
plt.show()


In [None]:
# Time series analysis: number of articles per calendar day.
# Resampling directly on the "date" column groups rows into daily bins
# without first building a re-indexed copy of the frame.
daily_counts = df.resample("D", on="date").size()

daily_ax = daily_counts.plot(figsize=(12, 5), title="Daily Article Frequency")
daily_ax.set_ylabel("Articles")
plt.show()
# Day of week: article volume per weekday, displayed Monday through Sunday.
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
]
df["day_of_week"] = df["date"].dt.day_name()

# reindex() imposes calendar order instead of the frequency order that
# value_counts() returns.
weekday_counts = df["day_of_week"].value_counts().reindex(weekday_order)
weekday_ax = weekday_counts.plot(
    kind="bar",
    figsize=(10, 5),
    title="Articles by Day of Week",
)
weekday_ax.set_ylabel("Articles")
plt.show()
# Hour of day: article volume per publication hour.
df["hour"] = df["date"].dt.hour

hourly_counts = df["hour"].value_counts().sort_index()
# Dates are parsed with errors="coerce", so unparseable values are NaT and
# .dt.hour becomes a float column. value_counts() already drops the NaN
# entries, so the remaining hours can be cast back to int; otherwise the bars
# are labelled 0.0, 1.0, ... instead of 0, 1, ...
hourly_counts.index = hourly_counts.index.astype(int)

hour_ax = hourly_counts.plot(
    kind="bar",
    figsize=(12, 5),
    title="Articles by Hour of Day",
)
hour_ax.set_xlabel("Hour")
hour_ax.set_ylabel("Articles")
plt.show()


In [None]:
import nltk

# Fetch the NLTK data packages needed by the text-processing cells below.
# NOTE(review): each call may hit the network on every run — confirm whether
# an offline/cached setup is expected.
nltk.download("punkt")  # tokenizer models; presumably what word_tokenize loads
nltk.download("punkt_tab")   # presumably the variant newer NLTK releases look for — TODO confirm
nltk.download("stopwords")  # corpus read via stopwords.words("english")
nltk.download("wordnet")  # presumably the lexical database behind WordNetLemmatizer
nltk.download("omw-1.4")  # presumably a companion wordnet resource some NLTK versions require — TODO confirm


In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# NLP preprocessing: tokenization and lemmatization.
# English stop words held in a set, which clean_text uses for membership tests.
stop_words = set(stopwords.words("english"))
# One shared WordNetLemmatizer instance, reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Convert one headline into a list of cleaned, lemmatized word tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in ``stop_words`` are dropped,
    and the survivors are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The headline. Non-string values are converted with ``str()``;
            missing values (``None``, ``NaN``, ``pd.NA``) are treated as empty.

    Returns:
        A list of token strings, in their original order. Empty when the
        headline is missing or contains no usable words.
    """
    # A missing headline used to be stringified to "nan"/"none", which passes
    # isalpha() and was then counted as a real keyword. Return no tokens
    # instead. is_scalar() guards pd.isna() against array-like input.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    # Single pass: keep alphabetic, non-stop-word tokens, then lemmatize them
    # (filtering happens before lemmatization, as before).
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(str(text).lower())
        if token.isalpha() and token not in stop_words
    ]

# Tokenize every headline into a new "tokens" column.
df["tokens"] = df["headline"].apply(clean_text)

# Preview the raw headline next to its token list for the first rows.
df.head()[["headline", "tokens"]]


In [None]:
# top keywords
from collections import Counter

# Flatten the per-headline token lists into one list of words.
all_words = [word for token_list in df["tokens"] for word in token_list]

# The 20 most frequent words as (keyword, count) pairs.
keyword_counts = Counter(all_words).most_common(20)

keyword_frame = pd.DataFrame(keyword_counts, columns=["keyword", "count"])
keyword_frame.plot(
    x="keyword",
    y="count",
    kind="bar",
    figsize=(12, 5),
    title="Most Common Keywords",
)

plt.show()
