In [1]:
import numpy as np
import pandas as pd
import re

!pip install fuzzywuzzy[speedup]
from fuzzywuzzy import fuzz, process



# Load data

In [None]:
# Raw budget data for every organization and year.
# Forward slashes keep this relative path valid on Windows, macOS and Linux
# (a '\\'-separated path is only understood on Windows).
all_data = pd.read_csv('data/archived/all_data.csv')

# Filters for RSOs, which have type 'PUB', 'SAG', or 'SISG'.
rso_labels = ['PUB', 'SAG', 'SISG']
rso_data = all_data[all_data['Type'].isin(rso_labels)].reset_index(drop=True)

# Fix data types

The first issue to address is the formatting in the `Allocation` column - particularly for the string values "Sponsorship Only" and "Sponsorship only".

In [None]:
# RSOs whose allocation is recorded as sponsorship-only (either capitalisation).
sponsorship_labels = ['Sponsorship Only', 'Sponsorship only']
sponsored_rso_data = rso_data[rso_data['Allocation'].isin(sponsorship_labels)]

# RSOs whose allocation is recorded as zero dollars.
zero_dollar_labels = ['$0', '$0.00']
unfunded_rso_data = rso_data[rso_data['Allocation'].isin(zero_dollar_labels)]

"Sponsorship Only" RSOs only appear in 2012, 2018, and 2019. Clubs allocated zero dollars only appear in 2013, 2014, 2015, 2016, and 2017, with the exception of four 2012 entries. Because the distributions of both categories are complementary, we can deduce that they have similar meanings: while the clubs are still sponsored by the ASUC, they receive no funding from the ASUC. Thus, "Sponsorship Only" really means an allocation of $0.

In [None]:
# Map both spellings of "Sponsorship Only" to '$0' so they look like every other allocation value.
sponsorship_to_zero = {'Sponsorship only': '$0', 'Sponsorship Only': '$0'}
rso_data.loc[:, 'Allocation'] = rso_data['Allocation'].replace(sponsorship_to_zero)

In [None]:
# Remove '$' and ',' characters so the strings can later be converted to float values.
# regex=False is essential: when the pattern is treated as a regular expression
# (the default before pandas 2.0), '$' is the end-of-string anchor and nothing is removed.
rso_data['Allocation'] = (
    rso_data['Allocation']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

# The STARR Referendum is mentioned in the 2018-2019 allocations; we'll save the names of these groups for reference.
# NOTE(review): the 'RaÌ_ces' entry looks mis-encoded; presumably it is spelled this way
# to match the raw data for the `isin` lookup below — confirm before "fixing" it.
rso_STARR = ['bridges Multicultural Resource Center',
            'Indigenous and Native Coalition Recruitment and Retention Center',
            'Middle Eastern North African Recruitment and Retention Center',
            'Mixed Student Union',
            'Pilipinx Academic Student Services',
            'RaÌ_ces Recruitment and Retention Center (formerly Raza Recruitment and Retention Center)',
            'REACH! Asian Pacific American Recruitment and Retention Center']

# The following line confirms this information.
# NOTE(review): 'Year' is compared with the string '2018' here — confirm the column holds
# strings at this point, otherwise the filter is always empty.
rso_data[(rso_data['Year'] == '2018') & (rso_data['Organization'].isin(rso_STARR))]

# Now back to cleaning the 'Allocations' column.
def clean_allocation(alloc):
    """Extract the first numeric token from an allocation value.

    Parameters
    ----------
    alloc : object
        Usually a string such as '1500.00'. Non-string values (e.g. NaN for a
        missing allocation) are accepted and treated as "no number found".

    Returns
    -------
    str or None
        The matched digits (with optional decimal part) as a string, or None
        when `alloc` is not a string or contains no digits.
    """
    # Missing values arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(alloc, str):
        return None

    result = re.search(r'[0-9]+[.]*[0-9]*', alloc)
    if result:
        return result[0]
    return None

# Extract the numeric part of every allocation and store the column as floats.
# Plain column assignment replaces the column (and its dtype); assigning through
# `.loc[:, 'Allocation']` writes into the existing object column in place on
# pandas >= 2.0, which would leave the dtype as object.
rso_data['Allocation'] = rso_data['Allocation'].apply(clean_allocation).astype(float)

In [None]:
# Store 'Standing' as integers rather than floats.
standing_as_int = rso_data['Standing'].astype(int)
rso_data['Standing'] = standing_as_int

# Clean CalLink designation data

In order to ensure that RSO names are consistent across budget years, each RSO can be matched to its CalLink name. We'll start by cleaning the CalLink data, `callink_data.txt`.

In [None]:
# Scraped CalLink listing; only the organization name and its designation are kept.
# Forward slashes keep this relative path valid on every OS (they also work on Windows).
desig_data = pd.read_csv('data/callink_data.txt')[['Organization', 'Designation']]

In [None]:
# When clubs don't add a profile picture to their CalLink page, they instead have a
# circle with the first letter of their org. The following code will filter this out of
# the webscraped names.
def ws_name_cleaner(name):
    """Drop the "<single character>\\n" prefix left by CalLink's placeholder avatar.

    Parameters
    ----------
    name : str
        Scraped organization name, e.g. 'A\\nAcme Club'.

    Returns
    -------
    str
        The text following the first "<character>\\n" pair, up to the next
        newline. Names without such a prefix are returned unchanged instead
        of raising TypeError on the failed match.
    """
    match = re.search(r'.\n(.*)', name)
    if match is None:
        return name
    return match[1]

# Row labels of every scraped name that contains a newline (the avatar-letter artefact).
newline_mask = desig_data['Organization'].str.contains('\n')
n_idx = desig_data[newline_mask].index

# Clean just those rows and write the results back.
cleaned_names = desig_data.loc[n_idx, 'Organization'].apply(ws_name_cleaner)
desig_data.loc[n_idx, 'Organization'] = cleaned_names

In [None]:
# Some campus departments also have asterisks around their organization names; filter those out.
def ws_asterisk_cleaner(name):
    """Return the text enclosed by the outermost pair of asterisks in `name`.

    Names that do not contain two asterisks are returned unchanged.
    """
    starred = re.search(r'\*(.*)\*', name)
    return name if starred is None else starred.group(1)

# Strip the surrounding asterisks from every organization name (names without them are left as-is).
desig_data['Organization'] = desig_data['Organization'].apply(ws_asterisk_cleaner)

In [None]:
# Now let's clean the designations. Each one has an extra space after the name, so that'll
# be an easy fix.
def del_space(name):
    """Remove a single trailing space from `name`, if there is one.

    Parameters
    ----------
    name : object
        Designation label. Non-string values (e.g. NaN) and the empty string
        are returned unchanged rather than raising.

    Returns
    -------
    object
        `name` without its final space; only one space is removed.
    """
    # str.endswith is safe on '' (name[-1] would raise IndexError).
    if isinstance(name, str) and name.endswith(' '):
        return name[:-1]
    return name

# Drop the trailing space from every designation label.
desig_data['Designation'] = desig_data['Designation'].apply(del_space)

In [None]:
# We can now look at all the designations.
# One row per designation; the 'Organization' column holds how many orgs carry it.
desig_groups = desig_data.groupby('Designation')
desig_count = desig_groups.count().reset_index()
desig_count

In [None]:
# Designations with at least 10 organizations (the threshold is inclusive: `>= 10`) are shown below.
valid_desigs = desig_count[desig_count['Organization'] >= 10]['Designation']
valid_desigs

In [None]:
# We can extract the typical RSO categories from them to create a list of "regular designations" for RSOs.
# We'll ignore the other designations for now.
# Keep only the common designations whose label mentions 'RSO'.
reg_desigs = []
for desig_label in valid_desigs:
    if 'RSO' in desig_label:
        reg_desigs.append(desig_label)
reg_desigs

In [None]:
# To account for outliers / inconsistent designations, we'll make a list of all the designations 
# that have less than 10 organizations under that category for closer examination.
# Designations carried by fewer than 10 organizations.
is_rare = desig_count['Organization'] < 10
odd_desigs = desig_count[is_rare]
odd_desigs

# There are definitely naming overlaps! Time to fix those.

In [None]:
# This cell normalises the rare designations found in `odd_desigs`.
# The row labels are hard-coded from inspecting the scraped CalLink data, so they are
# only valid for that exact version of `callink_data.txt`.
# `.loc` is used wherever several rows are updated at once: `.at` accepts only a single
# row/column label pair and raises when it is given a list of labels.

# Editing the academic RSOs in 'odd_desigs'.
academic = odd_desigs[odd_desigs['Designation'].str.contains('Academic')]['Designation']
desig_data[desig_data['Designation'].isin(academic)]

# While Der Kreis and Energy and Resources Grad Students seem academic, iGEM appears to be
# more tech / project oriented. These orgs will be edited accordingly.
desig_data.loc[[498, 534], 'Designation'] = 'Academic RSO'
desig_data.at[750, 'Designation'] = 'Technology RSO'

# Editing the cultural RSOs.
cultural = odd_desigs[odd_desigs['Designation'].str.contains('Cultural')]['Designation']
desig_data[desig_data['Designation'].isin(cultural)]

desig_data.at[745, 'Designation'] = 'Cultural & Identity/LGBTQ+ RSO'
desig_data.loc[[1384, 1387], 'Designation'] = 'Cultural & Identity/Women Interest RSO'

# Environmental RSO.
desig_data[desig_data['Designation'] == 'Environmental']
desig_data.at[256, 'Designation'] = 'Environmental & Sustainability RSO'

# Professional RSO.
desig_data[desig_data['Designation'].str.contains('Professional RSO - ')]
desig_data.at[427, 'Designation'] = 'Professional RSO'

# Media RSO.
desig_data[desig_data['Designation'] == 'Media RSO']
desig_data.at[723, 'Designation'] = 'Media & Film RSO'

# Performing arts RSO.
desig_data[desig_data['Designation'] == 'Performing Arts']
desig_data.at[297, 'Designation'] = 'Performing Arts RSO'

# Political RSO.
desig_data[(desig_data['Designation'].str.contains('Political'))
          & (desig_data['Designation'] != 'Political & Advocacy RSO')]
desig_data.loc[[58, 1138], 'Designation'] = 'Political & Advocacy RSO'

# Service RSO.
desig_data[(desig_data['Designation'].str.contains('Service'))
          & (desig_data['Designation'] != 'Service RSO')]
desig_data.loc[[359, 449, 505, 520, 881, 961], 'Designation'] = 'Service RSO'

# Spiritual RSO.
desig_data[(desig_data['Designation'].str.contains('Spiritual'))
          & (desig_data['Designation'] != 'Spiritual RSO')]
desig_data.loc[[201, 335, 789], 'Designation'] = 'Spiritual RSO'

# Sponsored RSO.
desig_data[desig_data['Designation'].str.contains('Sponsored')]
desig_data.loc[[280, 306, 754, 904, 912, 1037, 1100, 1102], 'Designation'] = 'Sponsored RSO'

# It doesn't seem like pages with no real designation are relevant to RSOs, but
# let's unify the label.
blank = ['Inactive Page', 'No Response']
desig_data[desig_data['Designation'].isin(blank)]
desig_data.loc[[100, 642, 964], 'Designation'] = 'None'

# Finally, we can extract RSO designations for future use.
rso_desigs = desig_data[desig_data['Designation'].str.contains('RSO')]

# Merge designations with `rso_data`

In [None]:
# Left join: every budget row is kept, and rows with a CalLink match gain a 'Designation'.
rso_data = rso_data.merge(rso_desigs, how='left', on='Organization')

# Rows without a CalLink match get the placeholder label 'None'.
designation_filled = rso_data['Designation'].fillna('None')
rso_data['Designation'] = designation_filled

# Rearranges column order.
cols = ['Year', 'Organization', 'Type', 'Designation', 'Standing', 'Allocation']
rso_data = rso_data[cols]

# Number of rows that still have no designation.
undesignated_rows = rso_data[rso_data['Designation'] == 'None']
len(undesignated_rows.index)

# Edit abbreviated RSO names

In [None]:
# Here, we'll manually address and edit the names of particular organizations with abbreviations.
# We generally want these edits to be consistent with 1) the group's most updated name, and
# 2) the group's name on CalLink, the official hub for Berkeley clubs (although this excludes
# former names of the club).
#
# The row labels are hard-coded from inspecting `rso_data` at this point in the notebook.
# `.loc` is used wherever several rows are updated at once: `.at` accepts only a single
# row/column label pair and raises when it is given a list of labels.

# Berkeley Opinion doesn't usually seem to go by BerkOp - defaulting to the full name.
rso_data.loc[[827, 2176], 'Organization'] = 'Berkeley Opinion'
rso_data[rso_data['Organization'].str.contains('Berkeley Opinion')]


# OASES, while commonly known by the abbreviation, formally lists its organization under the
# full name, 'Oakland Asian Students Education Services'. We'll leave it as is.
rso_data[rso_data['Organization'].str.contains('Oakland Asian')]


# Movimiento Estudiantil Chicano/xicana de Atzlan doesn't typically go by its abbreviation, MEChxA.
rso_data.loc[[287,692], 'Organization'] = 'Movimiento Estudiantil Chicano/xicana de Atzlan'
rso_data[rso_data['Organization'].str.contains('Estudiantil')]

# Project SMILE has a pretty long official name.
rso_data.loc[[298, 708, 1127, 1567, 2057, 2533], 'Organization'] = 'Project Spreading Multiculturalism and Inspiring Leadership through Education'
rso_data[rso_data['Organization'].str.contains('Project Spreading')]

# Chicano(a)s/Latino(a)s in Health Education has some odd mistypes
rso_data.loc[[261, 653, 1068, 2465], 'Organization'] = 'Chicano(a)s/Latino(a)s in Health Education'
rso_data[rso_data['Organization'].str.contains('Health Education')]

# Consult Your Community (CYC) at Berkeley
rso_data.at[456, 'Organization'] = 'Consult Your Community (CYC) at Berkeley'
rso_data[rso_data['Organization'].str.contains('CYC')]

# FEM Tech
rso_data.loc[[1761, 2715], 'Organization'] = 'FEMTech'
rso_data[rso_data['Organization'].str.contains('FEM')]

# EGO
rso_data.loc[[110, 476, 882, 1291, 1750], 'Organization'] = 'EGO (Cal\'s Traditional Korean Percussion Group)'
rso_data[rso_data['Organization'].str.contains('EGO')]

# iHeart
rso_data[rso_data['Organization'].str.contains('iHeart')]

# DULCE
rso_data.loc[[660, 2480], 'Organization'] = 'DULCE (Diabetes: Unidos Lograremos Controlar Esta Enfermedad)'
rso_data[rso_data['Organization'].str.contains('DULCE')]

# Cal Literature and Arts Magazine
rso_data.loc[[10, 762, 1181, 1623, 2112, 3022], 'Organization'] = 'Cal Literature & Arts Magazine'
rso_data[rso_data['Organization'].str.contains('Cal Literature')]

# Institute of Transportation Engineers at Berkeley
rso_data.loc[[141, 429], 'Organization'] = 'Institute of Transportation Engineers at Berkeley'
rso_data[rso_data['Organization'].str.contains('Transportation Engineers')]

# Sigma Iota Rho
rso_data.at[1409, 'Organization'] = 'Sigma Iota Rho: International & Area Studies Honor Society'
rso_data[rso_data['Organization'].str.contains('Sigma Iota')]

# Fixes weird character errors for Raíces
rso_data.loc[3394, 'Organization'] = 'Raíces Recruitment and Retention Center'

# Match RSOs without a designation

The following functions use the `fuzzywuzzy` package to match RSOs without a designation to similarly named organizations in `rso_desigs`. If the match is authenticated by the user, then the RSO name is replaced with its CalLink counterpart and assigned its designation.

In [None]:
# Helper functions for the main function, desig_assigner.

# Sifts through the possible designations before manually selecting a match. All decisions must be approved by the user.
def desig_selector(potential_desigs):
    """Ask the user to approve one of the candidate matches.

    Each candidate's first element is printed, followed by a 'y/n?: ' prompt.
    The first candidate answered with exactly 'y' is returned; if none is
    approved (or there are no candidates) the result is None.
    """
    for candidate in potential_desigs:
        print(candidate[0])
        if input('y/n?: ') == 'y':
            return candidate
    return None
            
# Cleans and relabels names according to CalLink, as well as assigns designations.
def clean_name_desigs(rso_data, rso_desigs, org_idx, desig):
    """Rename one RSO to its CalLink name and copy over the CalLink designation.

    Parameters
    ----------
    rso_data : pandas.DataFrame
        Has 'Organization' and 'Designation' columns; modified in place.
    rso_desigs : pandas.DataFrame
        CalLink table with a 'Designation' column; `desig[2]` must be one of
        its row labels.
    org_idx : label
        Row label in `rso_data` of the organization being matched.
    desig : sequence
        Approved match as (CalLink name, score, row label in `rso_desigs`);
        the score is not used here.

    Returns
    -------
    None
    """
    callink_name = desig[0]
    designation = rso_desigs.at[desig[2], 'Designation']

    # Rename the matched row only when the spelling actually differs.
    if rso_data.at[org_idx, 'Organization'] != callink_name:
        rso_data.at[org_idx, 'Organization'] = callink_name

    # Give the designation to every row carrying the CalLink name, whether or not a
    # rename was needed. A boolean mask with `.loc` is required here because `.at`
    # only addresses a single cell.
    same_org = rso_data['Organization'] == callink_name
    rso_data.loc[same_org, 'Designation'] = designation

In [None]:
# Main function for assigning designations.
def desig_assigner(rso_data, rso_desigs):
    """Interactively match every undesignated RSO against the CalLink names.

    For each row of `rso_data` whose 'Designation' is 'None', the two closest
    names in `rso_desigs['Organization']` are offered to the user through
    `desig_selector`; an approved match is applied with `clean_name_desigs`,
    which modifies `rso_data` in place.
    """
    # Snapshot of the rows to review, taken before any edits are applied.
    undesignated = rso_data[rso_data['Designation'] == 'None']

    for row_label, org in undesignated['Organization'].items():
        print('Organization Name: ' + org + '\n' + '----------')
        print('Desig check:')

        candidates = process.extract(org, rso_desigs['Organization'], limit=2)
        approved = desig_selector(candidates)

        if approved is not None:
            clean_name_desigs(rso_data, rso_desigs, row_label, approved)

        print('---------')

In [None]:
# Runs the interactive matcher. It prompts through input() for every undesignated RSO,
# so this cell needs someone at the keyboard and will stall an unattended "Run All".
desig_assigner(rso_data, rso_desigs)

# Assign designations manually (section to be cleaned)

Unfortunately, many designations are still missing in `rso_data`. They'll have to be manually inputted based on extensively searching CalLink.

In [None]:
# 'Female Empowerment and Mentoring in Technology', 'Berkeley Formula Racing', '0 Comedy', 'smART'

# rso_data_2020[rso_data_2020['Organization'].str.contains('Women and Youth')]

# rso_data_2020.at[27, 'Organization'] = '3DMC'
# rso_data_2020.at[299, 'Designation'] = 'Arts RSO'

# test = pd.concat([rso_data, rso_data_2020])
# test[test['Organization'].str.contains('0')]

# rso_data = pd.concat([rso_data, rso_data_2020]).reset_index(drop=True)
# rso_data[rso_data['Organization'].str.contains('Hyperloop')]
# NOTE(review): `rso_data_2020` is not defined anywhere in the visible notebook, so this
# line raises NameError on a fresh kernel — confirm where it should be loaded from.
rso_data_2020

In [None]:
# Import our modified rso_data.

# Reload the manually edited copy of rso_data and keep only the analysis columns.
# Forward slashes keep this relative path valid on every OS (they also work on Windows).
rso_data = pd.read_csv('data/archived/rso_data.txt')
rso_data = rso_data[['Year', 'Organization', 'Type', 'Designation', 'Standing', 'Allocation']]

In [None]:
# Names of groups that appear in the budget data but are not RSOs.
not_rso = ['Sigma Pi Alpha', 'Public Service Internships', 'Omega Phi Beta', 'Multi-Cultural Greek Council',
           'Lambda Theta Phi', 'Lambda Theta Nu', 'Lambda Theta Alpha', 'Delta Xi Phi', 'Gamma Rho Lambda',
           'Gamma Zeta Alpha', 'Greening the Greeks', 'Delta Sigma Theta', 'Alpha Phi Alpha', 'Alpha Kappa Alpha', 
           'Sigma Epsilon Omega', 'Sigma Omicron Pi', 'Sigma Psi Zeta']

# Regular expression matching any non-RSO name (used with `str.contains`).
# Character classes are written as [Gg], not [G|g]: inside brackets '|' is a literal
# character, so the old form also matched text such as '|amma'.
re_not_rso = 'Sigma Pi Alpha|Public Service Internships|Omega Phi Beta|Multi-Cultural Greek Council|Lambda Theta Phi|Lambda Theta Nu|Lambda Theta Alpha|Delta Xi Phi|[Gg]amma [Rr]ho [Ll]ambda|Gamma Zeta Alpha|Greening [tT]he Greeks|Delta Sigma Theta|Alpha Phi Alpha|Alpha Kappa Alpha|Sigma Epsilon Omega|Sigma Omicron Pi|Sigma Psi Zeta|Sigma Gamma Rho|UC Berkeley Public Service Center|National Pan-Hellenic|Democratic Education|CalGreeks|CalGreek|Lambda Upsilon Lambda|Phi Beta Sigma|Phi Sigma Rho|Order of Omega|CalTV|Open Computing Facility|Iota Phi Theta|Cal Corps|Sigma Omicron Pi|Alpha Kappa Lambda|Alpha Sigma Nu'

# Rows that still have no designation, excluding names matched by the non-RSO pattern.
rso_data[(rso_data['Designation'] == 'None') & (~rso_data['Organization'].str.contains(re_not_rso))]
# rso_data[(rso_data['Standing'] == 15) & (rso_data['Year'] == 2017)]


In [None]:
# The following RSOs were unsearchable and were assigned designations based on educated inferences.
#'Golden Paw Productions', 'California Patriot', 'Take Back the Night', 'Save a Refugee',
# 'Disabled Students' Union', 'American Parliamentary Debate Society of Berkeley', 'Raza Caucus', 'Project Nutrition'
# 'Berkeley City College Service Community', 'Students for Liberty', 'Delta Sigma Pi', 'Stop the Traffick',
# 'Students for Integrative Medicine', 'Students Against Fracking', 'Malaria No More', 'League of Legends at Berkeley',
# 'Health in All Policies', 'Cal Set Design', 'Black Political Law Association',
# 'Camp WOW (Week of Wilderness)', 'Cal Facilitation Team', 'Understanding Physics', 'UC Berkeley Men's Ice Hockey',
# 'Student Human Resources Association', 'Student Commuters at Cal', 'Speech and Debate at Berkeley', 
# 'ProActive Wellness', 'American Marketing Association', 'Korean Student Organizations', 'Investment Banking Club',
# 'Eritrean Student Association', 'Chinese Instrumental Society', 'Blues at Berkeley', 'Bengali Students Association',
# 'Cal Health Coalition', 'Human Journal at Berkeley', 'Kappa Alpha Delta', 'Fitness at Berkeley',
# 'Mechanism of Feeling', 'Big Ideas@Berkeley', 'United Pre-Health Peers', 'Escape the Lights Social Club',
# 'Career Attainment in Responsible Enterprises', 'Engineering Innovation & Consulting', 'Berkeley Energy',
# 'Alumni of Ecole Polytechnique at Berkeley', 'Cal Scholars Chapter', 'Freshman Sophomore Entrepreneurial Society'
# 'Mak', 'International Student Fellowship', 'Multi-Ethnic Student House'

# Scratch lookup used while searching CalLink for individual organizations.
rso_data[(rso_data['Organization'].str.contains('International Student'))]
# idx = rso_data[(rso_data['Organization'].str.contains('Latinx Architecture'))].index

# NOTE(review): 1693 is a hard-coded row label; presumably it refers to the row order of the
# archived rso_data.txt loaded above — confirm before re-running on different data.
rso_data.at[1693, 'Organization'] = 'Berkeley Hyperloop'
# rso_data.at[502, 'Designation'] = 'Cultural & Identity/International RSO'


# rso_data[(rso_data['Designation'].str.contains('LGBTQ'))]

# rso_data.iloc[652]
# rso_data['Designation'].unique()

In [None]:
# NOTE(review): `old_rso_data` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — confirm where it should be loaded from.
old_rso_data[old_rso_data['Organization'].str.contains('Ethiopian')]
# old_rso_data.iloc[876]

In [None]:
# Persist the manual edits made so far.
# Forward slashes keep this relative path valid on every OS (they also work on Windows).
rso_data.to_csv('data/archived/rso_data.txt')

In [None]:
# 1938, 1939; 'Volunteers Around the World'; 'Cal Bhangra' 1772, 1773; 'Berkeley Energy' 1679; 1816, 1817; 
# 'Women in Economics' 1938, 1939; 'Fitness at Berkeley'; 'Dreamers Project Mentorship Program' 2478, 2479;
# 'Surf Team' 1911, 1912; 'Foodino'; 'Model United Nations' 2172, 2410; BareAbundance/Feeding Forward; 
# 'Korean- American Scientists' 1810, 1811; 3341


# Drop every row whose name matches the non-RSO pattern, then list what still lacks a designation.
rso_data = rso_data[~rso_data['Organization'].str.contains(re_not_rso)]
# rso_data = rso_data.drop([1938, 1686, 1773, 1816, 2478, 1911, 1810, 628])
rso_data[rso_data['Designation'] == 'None']
# rso_data.loc[3341]
# rso_data.at[2172, 'Organization'] = 'Berkeley Model United Nations Conference'
# rso_data[rso_data['Organization'].str.contains('Feeding F')]

In [None]:
# Rows whose name mentions a former name ('formely' covers a misspelling seen in the data).
mentions_former = rso_data['Organization'].str.contains('former|Former|formely', regex=True)
name_changes = rso_data[mentions_former]
name_idx = name_changes.index

In [None]:
# Maps a former organization name (key) to the name it is replaced with (value).
# Several keys are spelling variants of the same former name and map to the same value.
name_dict = {'Collegiate StarLeague at Berkeley': 'eSports at Berkeley', 
               'Asian American Health Society': 'Pre-Health Society',
               'International Association of Business Communicators': 'Berkeley Business Society',
               'Conscious Living Collective': 'Conscious Network',
               'Ethiopian Student Union': 'Horn of Africa Student Association',
               'Global Medical Brigades': 'Global Health Brigades',
               'Project Nutrition': 'Picknic',
               'Square Up': 'Golden Squares',
               'Italian International Student Association': 'Italian Society at Berkeley',
               'Al-Bayan': 'threads',
               'STOP the Traffick': 'Anti-Trafficking Coalition at Berkeley',
               'Stop the Traffick': 'Anti-Trafficking Coalition at Berkeley',
               'SIFE': 'Enactus of Berkeley',
               'Autism Speaks U at Berkeley': 'Spectrum: Autism at Cal',
               'Creative Marketing Group': 'CMG Strategy Consulting',
               'Californium Brewing and Winemaking': 'Ferments at Berkeley',
               'Juggers of the Seven Regents': 'Jugger at Berkeley',
               'Latino Pre-Law Society': 'Latinx Pre-Law Society',
               'Undergraduate Statistics Association': 'Statistics Undergraduate Students Association',
               'Undergraduate Statistics Association at Berkeley': 'Statistics Undergraduate Students Association',
               'Berkeley Disaster Response Team': 'Berkeley Disaster Team',
               'Raza Recruitment and Retention Center': 'Raíces Recruitment and Retention Center',
               'Raza Recruitment & Retention Center': 'Raíces Recruitment and Retention Center',
               'True Asian Leaders': 'Thrive Aspire Lead',
               'Krása Cosmetics Team': 'Aurum Cosmetics',
               '100 College Black Men, Berkeley Campus': 'Collegiate 100',
               'EPOCH': 'Product Development at Berkeley'
              }

In [None]:
# Jugger at Berkeley (@ idx 2751) is missing the parentheses around its former name;
# they are added here so that clean_name's pattern can match the entry.

rso_data.at[2751, 'Organization'] = 'Jugger at Berkeley (formerly known as Juggers of the Seven Regents)'

In [None]:
# Some test cases for clean_name.

# Sample names covering the bracket styles and spellings seen in the data.
a = 'asian american [formerly balh blah]'
b = 'threads (formerly known as Al-Bayan)'
c = 'Spectrum: Autism at Cal (Formerly known as Autism Speaks U at Berkeley) (Spectrum)'
e = 'Jugger at Berkeley formerly known as Juggers of the Seven Regents'
f = 'Enactus of Berkeley (formely SIFE)'
g = 'Engineering Student Council (formerly Engineer\'s Joint Council)'
h = 'Berkeley Business Society (formerly International Association of Business Comm'

# NOTE(review): clean_name is defined in a later cell of this notebook, so this call
# raises NameError under "Restart & Run All" unless that cell has been run first.
clean_name(f)

In [None]:
# Filter out former naming tags.

# NOTE(review): clean_name is defined in a later cell of this notebook; run that cell first.
rso_data.loc[name_idx, 'Organization'] = rso_data.loc[name_idx, 'Organization'].apply(clean_name)

In [None]:
# Apply name changes with name_dict.

def change_name(name):
    """Return the replacement listed in `name_dict` for `name`, or `name` itself if unlisted."""
    return name_dict.get(name, name)

# Replace every former name that has an entry in name_dict; other names are left untouched.
rso_data.loc[:, 'Organization'] = rso_data.loc[:, 'Organization'].apply(change_name)

In [None]:
# Many RSOs also have abbreviations or alternative names.

# Rows whose name contains a parenthesised part, e.g. an abbreviation or alternative name.
# A raw string is used because '\(' is not a valid escape in a normal string literal
# and triggers an invalid-escape-sequence warning on recent Python versions.
shorthand = rso_data[rso_data['Organization'].str.contains(r'\(.*\)', regex=True)]

# It seems like there are inconsistencies with whether the groups go by the abbreviation or 
# the full name of the group. So we'll just have to cross-reference how the groups brand themselves
# over time and go with the most frequent use.

In [None]:
# # It seems that there are currently 1746 entries in the data that have no designation.
# # Time to fix that. We'll start by year and go backwards.

# unsure = rso_data.at[[3026, 3028], 'Organization']

# rso_data.at[3020, 'Organization'] = 'BerkTown Buzz'
# rso_data.at[[759, 3021], 'Organization'] = 'Blue and Gold Yearbook'
# rso_data.at[3027, 'Organization'] = 'Intercollegiate Finance Journal at Berkeley'


# # rso_data[rso_data['Organization'].str.contains('Latin')]
# # rso_desigs[rso_desigs['Organization'].str.contains('US')]

# NOTE(review): 'Year' is compared with the string '2019' here, while fix_standing below
# compares it with integers — confirm which type the column holds at this point.
rso_data[(rso_data['Designation'] == 'None') & (rso_data['Year'] == '2019')]

In [None]:
# What about groups that absorb smaller groups?

# Names grouped under the heading "groups that absorb smaller groups".
anti_trafficking = ['Anti-Trafficking Coalition at Berkeley',
                    'Student Abolitionist Movement',
                    'International Justice Mission',
                    'International Justice Mission Campus Chapter at Berkeley'
                   ]

# Placeholder: no member groups have been listed yet.
bridges = []

sustainable_housing_at_california = ['Tiny House in My Backyard', 'Solar Decathlon']
# Or groups that split up?

speech_and_debate = ['Speech at Berkeley', 'Parliamentary Debate at Berkeley']

# Groups that are now funded by the ASUC(?)

# Rows whose organization name contains 'DeCal'.
decal = rso_data[rso_data['Organization'].str.contains('DeCal')]

Somali, Ethiopian, Eritrean, South Sudanese, Sudanese Association (SEE§A) (formerly as Horn of Africa Student Association - HASA)

In [None]:
# Phi Gamma Nu
# KASA

In [None]:
# Removes 'formerly' tags from RSO names.

def clean_name(name):
    """Strip a bracketed '(formerly ...)' / '[formerly ...]' tag from an RSO name.

    Parameters
    ----------
    name : str
        Organization name, e.g. 'threads (formerly known as Al-Bayan)'.

    Returns
    -------
    str
        Everything before the last ' (for', ' (For', ' [for' or ' [For'.
        Names without such a tag are returned unchanged instead of raising
        TypeError on the failed match.
    """
    # '[\[(]' matches '[' or '('; escaping the bracket avoids the
    # "possible nested set" FutureWarning that '[[(]' produces.
    match = re.search(r'(.*) [\[(][Ff]or', name)
    if match is None:
        return name
    return match[1]

def clean_the(name):
    """Move a trailing ', The' to the front of an RSO name.

    Parameters
    ----------
    name : str
        Organization name, e.g. 'Daily Californian, The'.

    Returns
    -------
    str
        'The ' followed by the text before the last ', The'. Names that do
        not contain ', The' are returned unchanged instead of raising
        TypeError on the failed match.
    """
    match = re.search(r'(.*), The', name)
    if match is None:
        return name
    return 'The ' + match[1]

In [None]:
# formers_idx = rso_data[rso_data['Organization'].str.contains('[\(\[][Ff]or', regex=True)].index
# rso_data.loc[formers_idx, 'Organization'] = rso_data.loc[formers_idx, 'Organization'].apply(clean_name)

# the_idx = rso_data[(rso_data['Organization'].str.contains(', The')) & 
#          (~rso_data['Organization'].str.contains('Rishi|Therapists'))].index

# rso_data.loc[the_idx, 'Organization'] = rso_data.loc[the_idx, 'Organization'].apply(clean_the)
# Display rso_data; the name-cleaning steps above this line are commented out.
rso_data

In [None]:
# Organizations that appear in exactly 4 rows of rso_data.
# NOTE(review): 4 presumably means "present in four budget years" — confirm.
consistent = rso_data.groupby(['Organization']).count()
consistent = consistent[consistent['Year'] == 4].sort_values('Year')
#consistent.index

#rso_data[(rso_data['Organization'].isin(consistent.index)) & (rso_data['Standing'] == 1)]
# The next two expressions are not the last statement of the cell, so their results are not displayed.
rso_data[(rso_data['Standing'] == 0) & (rso_data['Allocation'] != 0)]
rso_data[rso_data['Organization'] == 'Anthropology Undergraduate Association']

#rso_data[(rso_data['Standing'] == 1) & (rso_data['Year'] == '2016')]
#rso_data[rso_data['Organization'] == 'Cal Queer & Asian']

# Every row (all years) for organizations that have a Standing of 0 in at least one row.
zero_standing = rso_data[rso_data['Standing'] == 0]['Organization']
zero_std_overall = rso_data[rso_data['Organization'].isin(zero_standing)].sort_values(['Organization', 'Year'])
zero_std_overall

# Drop duplicates

In [5]:
# Load the archived rso_data; the first CSV column is the saved index.
# Forward slashes keep this relative path valid on every OS (they also work on Windows).
rso_data = pd.read_csv('data/archived/rso_data.txt', index_col=0)

In [10]:
# Remove rows that duplicate an earlier row in every column (the first occurrence is kept).
rso_data = rso_data.drop_duplicates()

In [11]:
# Overwrite the archived file with the de-duplicated data.
# Forward slashes keep this relative path valid on every OS (they also work on Windows).
rso_data.to_csv('data/archived/rso_data.txt')

# Clean `Standing`

Only the 2016-2017 budget year has clubs with 0 `Standing`; these clubs are presumably new clubs. This is inconsistent with `Standing` in other years, however, where new RSOs begin with a standing of 1.

In [14]:
# Reload the de-duplicated data; the first CSV column is the saved index.
# Forward slashes keep this relative path valid on every OS (they also work on Windows).
rso_data = pd.read_csv('data/archived/rso_data.txt', index_col=0)

In [17]:
# Names (as a NumPy array) of every RSO that has a row with a Standing of 0.
has_zero_standing = rso_data['Standing'] == 0
zero_standing_rsos = rso_data.loc[has_zero_standing, 'Organization'].to_numpy()

Because of complicated ASUC rules about legacy names, I've decided to only adjust `Standing` for 0 values and for budget years/standings that increase sequentially.

In [31]:
# Edits standing for organizations affected by having 0 standing in 2016.
def fix_standing(row, zero_standing_rsos):
    """Return the corrected 'Standing' for one row of rso_data.

    Only organizations listed in `zero_standing_rsos` are adjusted, and only
    when the recorded standing is exactly what a club that started at 0 in
    2016 would have in that year (0 in 2016, 1 in 2017, ... 4 in 2020); such
    values are raised by one. Every other row keeps its recorded standing.
    """
    # (budget year, recorded standing) -> corrected standing
    corrections = {
        (2016, 0): 1,
        (2017, 1): 2,
        (2018, 2): 3,
        (2019, 3): 4,
        (2020, 4): 5,
    }
    recorded = row['Standing']

    if row['Organization'] not in zero_standing_rsos:
        return recorded
    return corrections.get((row['Year'], recorded), recorded)

In [32]:
# Recompute 'Standing' row by row, handing the zero-standing names to fix_standing.
corrected_standing = rso_data.apply(fix_standing, axis=1, args=(zero_standing_rsos,))
rso_data['Standing'] = corrected_standing

In [35]:
# Save a new version of rso_data to distinguish edits to `Standing`.
# Forward slashes keep this relative path valid on every OS (they also work on Windows).
rso_data.to_csv('data/rso_data_v2.txt')