# In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

"""
This script processes AP College Poll Top 25 voting for every week in the seasons from 2014 to 2024. It iteratively gathers
poll data for all the weeks in a specified season and combines the data gathered from all seasons into a single merged CSV file.

Input: URLs corresponding to AP College Poll ballot data for each week from 2014 to 2024.

Output: 
1. One merged CSV file containing data from all weeks from 2014 to 2024.
"""


def _team_slug(team_cell):
    """Return the team name encoded in a 'gridTeam' cell's second CSS class.

    The second class token has the form ``gi_{team_name}``; the first three
    characters (``gi_``) are dropped.
    """
    return team_cell.get('class')[1][3:]


def _ballots_from_grid(soup, year, week_number):
    """Collect every submitted ballot found in one parsed weekly grid page.

    Args:
        soup: Parsed page (BeautifulSoup object) for a single week.
        year: Season label stored in each output row.
        week_number: Week number stored in each output row.

    Returns:
        Dict mapping pollster identifier to the row
        ``[pollster, year, week_number, team_1, team_2, ...]``. Empty when the
        page has no ``div#gridBallots`` element.
    """
    ballots = {}
    grid = soup.find('div', id='gridBallots')
    if not grid:
        return ballots

    for row in grid.find_all('div', class_='gridRow'):
        pollster_href = row.find('div', class_='gridPollster').find('a')['href']
        # The 4th '/'-separated segment of the link is used as the pollster identifier.
        pollster = pollster_href.split("/")[3]
        team_cells = row.find_all('div', class_='gridTeam')

        # A 'blank' first slot means the pollster did not submit a ballot this week.
        if _team_slug(team_cells[0]) == 'blank':
            continue

        ballots[pollster] = [pollster, year, week_number] + [
            _team_slug(cell) for cell in team_cells
        ]

    return ballots


def author_ballot_dictionary(Weeks, url, year, short_szn=True):
    """Download and parse the individual ballots for each week of one season.

    Args:
        Weeks: Ordered list of URL suffixes, one per week (e.g. 'pre-season',
            'week-2', ..., 'final-rankings').
        url: Base URL for the season; each entry of ``Weeks`` is appended to it.
        year: Season label stored in every output row.
        short_szn: When True, the second-to-last entry of ``Weeks`` is treated
            as a week that does not exist for this season: it is never
            requested, and the entries after it are numbered one lower so the
            last entry takes its week number.

    Returns:
        List with one dict per successfully downloaded week. Each dict maps a
        pollster identifier to ``[pollster, year, week, team_1, team_2, ...]``.
        Weeks whose request does not return HTTP 200 are reported on stdout
        and left out of the list.
    """
    Dictionaries = []
    final_week_offset = 0
    # Index of the placeholder week that short seasons do not have.
    skipped_index = len(Weeks) - 2 if short_szn else None

    for i, week_slug in enumerate(Weeks):
        # Decide on the skip BEFORE touching the network: the week numbering
        # must shift even if the placeholder page would have returned an error,
        # and there is no reason to request a page that is never used.
        if i == skipped_index:
            final_week_offset = -1
            continue

        new_url = url + week_slug
        response = requests.get(new_url)
        if response.status_code != 200:
            print(f"bad response {response.status_code} for {new_url}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        week_number = i + 1 + final_week_offset
        Dictionaries.append(_ballots_from_grid(soup, year, week_number))

    return Dictionaries

def csv_data_appender_by_year(year):
    """Scrape one season's ballots and append them to 'college_polls_original.csv'.

    Args:
        year: Season as a string (e.g. "2019"); it is inserted into the
            request URL and stored in the 'Season' column.

    The column header is written only while the output file is still empty, so
    the merged file ends up with a single header row instead of one per week.
    """
    import os  # local import: used only for the empty-file check below

    Weeks = ['pre-season', 'week-2', 'week-3', 'week-4', 'week-5', 'week-6', 'week-7', 'week-8', 'week-9', 'week-10', 'week-11', 'week-12', 'week-13', 'week-14', 'week-15', 'week-16', 'final-rankings']
    url = "https://collegepolltracker.com/football/grid/" + year + "/"
    # Seasons that use every entry of Weeks; all other seasons have no 'week-16'.
    long_szns = ("2014", "2019", "2020")

    Dictionaries = author_ballot_dictionary(Weeks, url, year, short_szn=year not in long_szns)

    columns = [
        'Pollster', 'Season', 'Week',
        '1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th',
        '11th', '12th', '13th', '14th', '15th', '16th', '17th', '18th', '19th', '20th',
        '21st', '22nd', '23rd', '24th', '25th',
    ]
    out_path = 'college_polls_original.csv'
    # The file is opened in append mode for every week, so emit the header only
    # if nothing has been written to it yet.
    needs_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0

    for dicto in Dictionaries:
        # A week with no parsed ballots contributes no rows; skip it so it
        # cannot emit a stray header-only line.
        if not dicto:
            continue
        df = pd.DataFrame.from_dict(dicto, orient='index', columns=columns)
        df.to_csv(out_path, mode='a', index=False, header=needs_header)
        needs_header = False


def csv_creation():
    """Rebuild 'college_polls_original.csv' from scratch for seasons 2014-2024.

    The per-season writer appends to the file, so the file is emptied first to
    make sure no content from an earlier run is left in it.
    """
    years = [str(season) for season in range(2014, 2025)]

    # Opening in "w" mode creates the file or truncates it to zero length;
    # the context manager guarantees the handle is closed.
    with open("college_polls_original.csv", "w"):
        pass

    for year in years:
        csv_data_appender_by_year(year)

# Run the full scrape and rebuild the output CSV when this cell/script executes.
# NOTE(review): this also runs if the file is imported as a module; consider an
# `if __name__ == "__main__":` guard if that is not intended.
csv_creation()