In [1]:
SCHOOL="berkeley"
SUBJECT="opinion"
START_YEAR=2010
FINAL_YEAR=2022
DATA_DIR="data"      # should be 'data'
OUTPUT_DIR="output"  # should be 'output'

In [2]:
%pwd

'/Users/vibhavirmani/Desktop/ASDRP/diversity-colleges/notebooks'

In [3]:
%cd ..

/Users/vibhavirmani/Desktop/ASDRP/diversity-colleges


In [4]:
import sys
sys.path.append('src')
import ouraws
import ourgraphs
import textutil


In [5]:
# NOTE: before loading, we need to be in the "diversity-colleges" folder

S3OBJECT_KEY = f"{DATA_DIR}/{SCHOOL}-{SUBJECT}-SNAPSHOT.parquet"
df = ouraws.getFromS3(S3OBJECT_KEY)

In [6]:
df.shape

(6095, 6)

In [7]:
df.head(2)

Unnamed: 0,title,url,body,year,month,day
0,Opening up,https://dailycal.org/2022/12/08/opening-up,Opening up\nThe end of the semester always com...,2022,12,8
1,A look into economic disparity among Asian Ame...,https://dailycal.org/2022/12/07/a-look-into-ec...,A look into economic disparity among Asian Ame...,2022,12,7


In [8]:
import requests
import os

BIPARTISAN_API_KEY = os.environ.get("BIPARTISAN_API")

BIPARTISAN_URL = "https://api.thebipartisanpress.com/api/endpoints/beta/robert"

In [9]:
df2022 = df[df['year'] == 2022]

articles_list = df2022['body'].to_list()
for article in articles_list:
    payload = {"API": BIPARTISAN_API_KEY, "Text": article.encode("utf-8")}
    response = requests.post(BIPARTISAN_URL, data=payload)
    print(response.text)
    break

400 Bad Request: The browser (or proxy) sent a request that this server could not understand.


In [10]:
df['year'].unique()

array([2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012,
       2011])

In [13]:
# SYNCHRONOUS METHOD : TAKES A LONG TIME

results = []

for year in range(2010, 2023):
    df2 = df[df['year'] == year]
    print(f"Year: {year} ... {df2.shape[0]}")

    articles_list = df2['body'].to_list()
    value_sum = 0.0
    article_count = 0
    for article in articles_list:
        payload = {"API": BIPARTISAN_API_KEY, "Text": article.encode("utf-8")}
        response = requests.post(BIPARTISAN_URL, data=payload)
        try:
            value_sum += float(response.text)
            article_count += 1
            # print(f"{article_count}", end=".")
        except:
            pass  # this is a non-number
        
    results.append({
        'year' : year,
        'article_count' : article_count,
        'polarity_sum' : value_sum,
        'polarity_avg' : float(value_sum) / article_count
    })

    print(f"{year}\t{article_count}\t{value_sum}\t{value_sum/article_count}")
# for each year, get all articles' bodies & send to bipartisan press API 
#    and sum up the values & output the average for each year

Year: 2010 ... 0


ZeroDivisionError: float division by zero

In [None]:
import pandas as pd

RESULTS_FILE = f"{DATA_DIR}/{SCHOOL}-POLARITY.parquet"

results_df = pd.DataFrame.from_records(results)
results_df.to_parquet(RESULTS_FILE)

NameError: name 'results' is not defined

In [16]:
# ASYNCHRONOUS METHOD : should be a lot faster
import asyncio

results = []

for year in range(2015, 2020):
    df2 = df[df['year'] == year]
    print(f"Year: {year} ... {df2.shape[0]}")

    articles_list = df2['body'].to_list()
    value_sum = 0.0
    article_count = 0
    for article in articles_list:
        payload = {"API": BIPARTISAN_API_KEY, "Text": article.encode("utf-8")}
        response = requests.post(BIPARTISAN_URL, data=payload)
        try:
            value_sum += float(response.text)
            article_count += 1
            # print(f"{article_count}", end=".")
        except:
            pass  # this is a non-number
        
    results.append({
        'year' : year,
        'article_count' : article_count,
        'polarity_sum' : value_sum,
        'polarity_avg' : float(value_sum) / (article_count+1)
    })

    print(f"{year}\t{article_count}\t{value_sum}\t{value_sum/(article_count+1)}")
# for each year, get all articles' bodies & send to bipartisan press API 
#    and sum up the values & output the average for each year

Year: 2015 ... 583


In [None]:
import asyncio
import aiohttp

async def fetch(session, url, data):
    async with session.post(url, data=data) as response:
        return await response.text()

async def process(articles_list):
    async with aiohttp.ClientSession() as session:
        tasks = [ fetch(session, BIPARTISAN_URL, 
                            {
                                "API": BIPARTISAN_API_KEY, 
                                "Text": article.encode("utf-8")
                            }
                       ) 
                  for article in articles_list
                ]
        responses = await asyncio.gather(*tasks)
        for response in responses:
            print(response)


In [None]:
df2 = df[df['year'] == 2022]
print(f"Year: {year} ... {df2.shape[0]}")

articles_list = df2['body'].to_list()

Year: 2010 ... 388


In [None]:
loop = asyncio.get_event_loop()
loop.run_until_complete(process(articles_list[:10]))
loop.close()