# Politics in Harvard

In [1]:
SCHOOL="harvard"
SUBJECT="opinion"
START_YEAR=2010
FINAL_YEAR=2022
TOPIC_EMBEDDING_MODEL = "all-MiniLM-L6-v2" # alt: "all-mpnet-base-v2"
DATA_DIR="data"      # should be 'data'
OUTPUT_DIR="output"  # should be 'output'

In [2]:
%pwd

'/Users/theodoremui/Library/CloudStorage/OneDrive-Personal/dev/diversity-colleges/notebooks'

In [3]:
%cd ..

/Users/theodoremui/Library/CloudStorage/OneDrive-Personal/dev/diversity-colleges


In [4]:
import sys
sys.path.append('src')
import ouraws
import ourgraphs
import textutil

In [5]:
# NOTE: before loading, we need to be in the "collegier" folder

S3OBJECT_KEY = f"{DATA_DIR}/{SCHOOL}-{SUBJECT}-SNAPSHOT.parquet"
df = ouraws.getFromS3(S3OBJECT_KEY)

In [6]:
df.shape

(5029, 6)

In [7]:
df.head(2)

Unnamed: 0,title,url,body,year,month,day
0,Let’s Talk about Religion,https://www.thecrimson.com/article/2023/1/25/l...,Let’s Talk about Religion\nBy Leah R. BaronLea...,2023,1,25
1,A Cambridge Winter,https://www.thecrimson.com/article/2023/1/25/e...,"A Cambridge Winter\nBy Emily N. Dial, Crimson ...",2023,1,25


In [30]:
import requests
import os

BIPARTISAN_API_KEY = os.environ.get("BIPARTISAN_API")

BIPARTISAN_URL = "https://api.thebipartisanpress.com/api/endpoints/beta/robert"

In [23]:
df2022 = df[df['year'] == 2022]

articles_list = df2022['body'].to_list()
for article in articles_list:
    payload = {"API": BIPARTISAN_API_KEY, "Text": article.encode("utf-8")}
    response = requests.post(BIPARTISAN_URL, data=payload)
    print(response.text)
    break

-14.532673


In [24]:
df['year'].unique()

array([2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013,
       2012, 2011, 2010, 2009, 2008, 2007, 2003, 1999, 1997, 1996, 1975,
       2004, 2002, 1960])

In [43]:
# TAKE A LOOOOOONG TIME
# loop through harvard's data from 2010 to 2022
for year in range(2010, 2023):
    df2 = df[df['year'] == year]
    print(f"Year: {year} ... {df2.shape[0]}")

    articles_list = df2['body'].to_list()
    value_sum = 0.0
    article_count = 0
    for article in articles_list:
        payload = {"API": BIPARTISAN_API_KEY, "Text": article.encode("utf-8")}
        response = requests.post(BIPARTISAN_URL, data=payload)
        try:
            value_sum += float(response.text)
            article_count += 1
            print(f"{article_count}", end=".")
        except:
            pass  # this is a non-number

    print(f"{year}\t{article_count}\t{value_sum}\t{value_sum/article_count}")
# for each year, get all articles' bodies & send to bipartisan press API 
#    and sum up the values & output the average for each year

Year: 2010 ... 473
1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.18.19.20.21.22.23.24.25.26.27.28.29.30.31.32.33.34.35.36.37.38.39.40.41.42.43.44.45.46.47.48.49.50.51.52.53.54.55.56.57.58.59.60.61.62.63.64.65.66.67.68.69.70.71.72.73.74.75.76.77.78.79.80.81.82.83.84.85.86.87.88.89.90.91.92.93.94.95.96.97.98.99.100.101.102.103.104.105.106.107.108.109.110.111.112.113.114.115.116.117.118.119.120.121.122.123.124.125.126.127.128.129.130.131.132.133.134.135.136.137.138.139.140.141.142.143.144.145.146.147.148.149.150.151.152.153.154.155.156.157.158.159.160.161.162.163.164.165.166.167.168.169.170.171.172.173.174.175.176.177.178.179.180.181.182.183.184.185.186.187.188.189.190.191.192.193.194.195.196.197.198.199.200.201.202.203.204.205.206.207.208.209.210.211.212.213.214.215.216.217.218.219.220.221.222.223.224.225.226.227.228.229.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.245.246.247.248.249.250.251.252.253.254.255.256.257.258.259.260.261.262.263.264.265.266.267.268.269.270.271.272.2

In [38]:
import asyncio
import aiohttp

async def fetch(session, url, data):
    async with session.post(url, data=data) as response:
        return await response.text()

async def process(articles_list):
    async with aiohttp.ClientSession() as session:
        tasks = [ fetch(session, BIPARTISAN_URL, 
                            {
                                "API": BIPARTISAN_API_KEY, 
                                "Text": article.encode("utf-8")
                            }
                       ) 
                  for article in articles_list
                ]
        responses = await asyncio.gather(*tasks)
        for response in responses:
            print(response)


In [39]:
df2 = df[df['year'] == 2022]
print(f"Year: {year} ... {df2.shape[0]}")

articles_list = df2['body'].to_list()

Year: 2010 ... 388


In [None]:
loop = asyncio.get_event_loop()
loop.run_until_complete(process(articles_list[:10]))
loop.close()