# In [2]:
import pandas as pd
import os

"""
This script processes a CSV file containing AP College Poll Top 25 voting by grouping the data by 'Season' and 'Week'. 
It extracts the names of voters and voted teams (from the top 25 ranks) and saves the results as separate CSV files.

Input: A CSV file with AP College Poll Top 25 voting, including columns for voter names and team rankings 
('1st' to '25th').

Output: 
1.  A CSV file containing all unique voted teams and another for all voters.
"""

# Load every ballot row. The path is relative to the current working directory.
df = pd.read_csv('../../entire_ballot_data/college_polls_original.csv')

# Process and store the Pollster column: write the sorted, de-duplicated voter
# names to all_voters.csv.
all_voters = df["Pollster"].tolist()

# Skip missing cells before de-duplicating: pandas reads them as float NaN, and
# sorting a mix of floats and strings raises TypeError.
unique_voters = {voter for voter in all_voters if pd.notna(voter)}

# The literal "Pollster" can appear as a cell value (presumably from header
# rows repeated inside the file -- confirm against the data). discard() drops
# it when present and is a no-op otherwise, whereas list.remove() raised
# ValueError when it was absent.
unique_voters.discard("Pollster")

all_unique_voters = sorted(unique_voters)
new_df = pd.DataFrame(all_unique_voters, columns=["Voters"])
new_df.to_csv("all_voters.csv", index=False)

# Rank columns of a ballot; each cell holds the team a voter placed at that rank.
columns_1_to_25 = [
    '1st', '2nd', '3rd', '4th', '5th',
    '6th', '7th', '8th', '9th', '10th',
    '11th', '12th', '13th', '14th', '15th',
    '16th', '17th', '18th', '19th', '20th',
    '21st', '22nd', '23rd', '24th', '25th',
]

# Flatten every rank column into one list of team names. Order does not matter
# here because only the distinct values are kept below.
all_voted_teams = []
for col in columns_1_to_25:
    all_voted_teams.extend(df[col].tolist())

# Process and store teams from the '1st' to '25th' columns.
# Missing cells are skipped first: pandas reads them as float NaN, and sorting
# a mix of floats and strings raises TypeError. Cells equal to a rank-column
# label (presumably from header rows repeated inside the file -- confirm
# against the data) are then removed so only real team names remain.
all_unique_teams_without_ranking_nums = {
    team for team in all_voted_teams if pd.notna(team)
}.difference(columns_1_to_25)
all_unique_teams = sorted(all_unique_teams_without_ranking_nums)
new_df = pd.DataFrame(all_unique_teams, columns=["Voted Teams"])
new_df.to_csv("all_voted_teams.csv", index=False)