# Publisher Analysis

In [4]:
import pandas as pd
from collections import Counter

# Load the raw analyst-ratings export; the 'date' column is handed to the
# pandas date parser while reading.
df = pd.read_csv(
    'C:/raw_analyst_ratings/raw_analyst_ratings.csv',
    parse_dates=['date'],
)

# Top Publishers Analysis
# Ten most frequent publisher values with their article counts, plus each
# one's share of every row in the dataset (not just of the top ten).
top_counts = df['publisher'].value_counts().head(10)
total_articles = len(df)
publisher_stats = pd.DataFrame({
    'Publisher': top_counts.index,
    'Article Count': top_counts.values,
})
publisher_stats['% of Total'] = publisher_stats['Article Count'] / total_articles * 100

# Content Categories
# Each category is described by a handful of keyword fragments; the fragments
# are OR-ed together into one regular expression per category.
# NOTE(review): enhanced_content_analysis applies these with
# str.contains(..., case=False), i.e. as case-insensitive substring searches,
# so short fragments such as 'SEC', 'Fed' or 'EPS' also hit inside longer
# words ('sector', 'offered', 'steps'). Confirm whether that is intended.
_CATEGORY_KEYWORDS = (
    ('Earnings', ('earnings', 'EPS', 'results', 'Q[1-4]', 'quarterly')),
    ('M&A', ('merger', 'acquisition', 'M&A', 'takeover', 'buyout')),
    ('Guidance', ('guidance', 'forecast', 'outlook', 'projection')),
    ('Analyst', ('analyst', 'rating', 'upgrade', 'downgrade', 'initiate', 'coverage')),
    ('Regulatory', ('FDA', 'SEC', 'regulator', 'approval', 'investigation')),
    ('Dividends', ('dividend', 'payout', 'yield', 'shareholder return')),
    ('Macro', ('economy', 'inflation', 'rates', 'Fed', 'central bank')),
)
content_categories = {
    label: '|'.join(fragments) for label, fragments in _CATEGORY_KEYWORDS
}

def enhanced_content_analysis(publisher_name):
    """Summarise how often each content category appears in one publisher's headlines.

    Every pattern in the module-level ``content_categories`` mapping is
    searched for (case-insensitive regex) in the publisher's headlines and in
    all headlines of the module-level ``df``.

    Args:
        publisher_name: Value compared for exact equality with
            ``df['publisher']``.

    Returns:
        A DataFrame with one row per category and the columns ``Count``
        (matching headlines of the publisher), ``% of Publisher Content``
        (that count relative to the publisher's rows) and
        ``vs Market Average`` (publisher share minus the share over all
        rows, in percentage points). The two percentage columns are
        preformatted strings. A publisher without any rows gets a share of
        ``0.0%`` instead of a division-by-zero error or ``nan%``.
    """
    all_headlines = df['headline']
    pub_headlines = all_headlines[df['publisher'] == publisher_name]
    total = len(pub_headlines)
    market_total = len(all_headlines)

    results = {}
    for category, pattern in content_categories.items():
        # na=False makes missing headlines explicit non-matches rather than
        # relying on NaN being skipped when the mask is summed.
        count = pub_headlines.str.contains(pattern, case=False, na=False).sum()
        market_count = all_headlines.str.contains(pattern, case=False, na=False).sum()

        # An empty selection has no meaningful share; report zero.
        pub_share = count / total if total else 0.0
        market_share = market_count / market_total if market_total else 0.0

        results[category] = {
            'Count': count,
            '% of Publisher Content': f"{pub_share*100:.1f}%",
            'vs Market Average': f"{(pub_share - market_share)*100:+.1f}%"
        }
    # The dict is keyed by category, so transpose to get categories as rows.
    return pd.DataFrame(results).T

# Analyze top publishers
# Names of the three publishers with the most rows, most prolific first.
publisher_counts = df['publisher'].value_counts()
top_publishers = publisher_counts.iloc[:3].index.tolist()

# One category-breakdown table per top publisher, keyed by publisher name.
content_tables = {
    name: enhanced_content_analysis(name) for name in top_publishers
}

# Display results
for publisher, table in content_tables.items():
    heading = f"\nCONTENT SPECIALIZATION: {publisher.upper()}"
    print(heading)
    print("----------------------------------------")
    # Shade the Count column by magnitude and left-align every cell.
    styled = table.style.background_gradient(subset=['Count'])
    styled = styled.set_properties(**{'text-align': 'left'})
    display(styled)

# top 3 publishers
# Takes the first three names from publisher_stats (which was built from
# value_counts(), so they are the three most frequent publishers) and builds
# one column per publisher from analyze_content, then transposes so each
# publisher becomes a row.
# NOTE(review): analyze_content is not defined in the visible part of this
# file; presumably it comes from an earlier notebook cell. Confirm, otherwise
# this raises NameError on a fresh kernel.
# NOTE(review): content_analysis does not appear to be used in the code that
# follows here; verify whether this step is still needed.
top_publishers = publisher_stats['Publisher'].head(3).tolist()
content_analysis = pd.DataFrame(
    {pub: analyze_content(pub) for pub in top_publishers}
).T

# Email Domain Analysis
# Some publisher values are e-mail addresses; capture whatever follows the
# '@'. The character class includes '-' because hyphens are legal in host
# names (without it 'desk@my-site.com' would be tallied as 'my'), and the
# result is lower-cased so 'Gmail.com' and 'gmail.com' are counted together.
email_domains = (
    df['publisher']
    .str.extract(r'@([\w.-]+)')[0]
    .dropna()
    .str.lower()
)

# Branch on whether any domain was actually captured, so a publisher value
# with a bare '@' still falls through to the placeholder row below.
if not email_domains.empty:
    domains = email_domains.value_counts().head(5)
    domain_stats = pd.DataFrame({
        'Email Domain': domains.index,
        'Article Count': domains.values
    })
else:
    domain_stats = pd.DataFrame({'Email Domain': ['No email addresses found'], 'Article Count': [0]})

# Display results
print("TOP PUBLISHERS ANALYSIS")
print("-----------------------")
# '% of Total' is stored as a float; format it only for presentation.
display(publisher_stats.style.format({'% of Total': '{:.1f}%'}))


print("\nTOP EMAIL DOMAINS ")
print("--------------------------------")
display(domain_stats.style.hide(axis='index'))


CONTENT SPECIALIZATION: PAUL QUINTARO
----------------------------------------


Unnamed: 0,Count,% of Publisher Content,vs Market Average
Earnings,69817,30.6%,+11.6%
M&A,3454,1.5%,-0.4%
Guidance,9548,4.2%,+1.7%
Analyst,41620,18.2%,+4.7%
Regulatory,12006,5.3%,+0.9%
Dividends,4343,1.9%,+0.2%
Macro,2641,1.2%,-0.9%



CONTENT SPECIALIZATION: LISA LEVIN
----------------------------------------


Unnamed: 0,Count,% of Publisher Content,vs Market Average
Earnings,29619,15.8%,-3.2%
M&A,1428,0.8%,-1.1%
Guidance,1611,0.9%,-1.7%
Analyst,21933,11.7%,-1.8%
Regulatory,382,0.2%,-4.1%
Dividends,1151,0.6%,-1.1%
Macro,760,0.4%,-1.7%



CONTENT SPECIALIZATION: BENZINGA NEWSDESK
----------------------------------------


Unnamed: 0,Count,% of Publisher Content,vs Market Average
Earnings,37851,25.2%,+6.1%
M&A,1519,1.0%,-0.9%
Guidance,8090,5.4%,+2.8%
Analyst,7102,4.7%,-8.8%
Regulatory,12332,8.2%,+3.8%
Dividends,5321,3.5%,+1.8%
Macro,3121,2.1%,-0.0%


TOP PUBLISHERS ANALYSIS
-----------------------


Unnamed: 0,Publisher,Article Count,% of Total
0,Paul Quintaro,228373,16.2%
1,Lisa Levin,186979,13.3%
2,Benzinga Newsdesk,150484,10.7%
3,Charles Gross,96732,6.9%
4,Monica Gerson,82380,5.9%
5,Eddie Staley,57254,4.1%
6,Hal Lindon,49047,3.5%
7,ETF Professor,28489,2.0%
8,Juan Lopez,28438,2.0%
9,Benzinga Staff,28114,2.0%



TOP EMAIL DOMAINS 
--------------------------------


Email Domain,Article Count
benzinga.com,7937
gmail.com,139
andyswan.com,5
investdiva.com,2
tothetick.com,2
