In [7]:
import numpy as np
import pandas as pd
import re

from fuzzywuzzy import process
pd.options.display.max_colwidth = 200

# Load 2018-2019 final allocations

In [8]:
# Load the combined RSO allocation table; the first CSV column is used as the index.
final = pd.read_csv('C:\\Users\\scher\\school\\senior-thesis\\data\\rso_data_v2.txt', index_col=0)

# Keep only the 2018 rows and renumber them from 0. `reset_index(drop=True)`
# discards the old index directly instead of materialising it as an 'index'
# column that then has to be dropped by name (which breaks if the index is named).
final = final[final['Year'] == 2018].reset_index(drop=True)
final

Unnamed: 0,Year,Organization,Type,Designation,Standing,Allocation
0,2018,BARE Magazine,PUB,Publications RSO,11,3100.0
1,2018,Berkeley Fiction Review,PUB,Publications RSO,35,1500.0
2,2018,Berkeley Poetry Review,PUB,Publications RSO,24,500.0
3,2018,The Berkeley Political Review,PUB,Publications RSO,17,7000.0
4,2018,Berkeley Science Review,PUB,Publications RSO,18,1000.0
...,...,...,...,...,...,...
418,2018,Volunteer Income Tax Assistance Program,SISG,Service RSO,5,600.0
419,2018,Wonderworks,SISG,Service RSO,8,600.0
420,2018,You Mean More,SISG,Health & Wellness RSO,7,1500.0
421,2018,Youth Empowerment Program,SISG,Service RSO,5,1500.0


# Load and clean 2018-2019 initial allocations

In [3]:
# Read the tab-delimited initial-allocation sheet, keeping only the four
# columns of interest (selected by position) and naming them explicitly.
initial = pd.read_csv('C:\\Users\\scher\\school\\senior-thesis\\data\\reformatted\\2018-2019_initial.txt',
                      delimiter='\t',
                      usecols=[1,3,5,7],
                      names=['Organization', 'Type', 'Standing', 'Allocation'])

# Keep only the PUB / SAG / SISG rows. The explicit .copy() makes the filtered
# frame independent of the one read from disk, so the column assignments that
# follow are never treated as writes to a slice of another frame.
initial = initial[initial['Type'].isin(['PUB','SAG','SISG'])].copy()

# Vectorised integer conversion of the standing column.
initial['Standing'] = initial['Standing'].astype('int64')

def clean_alloc(alloc):
    """Convert a raw allocation cell to a float dollar amount.

    Parameters
    ----------
    alloc : str or number
        Raw cell value, e.g. ``'$3,000'``, ``'600'`` or ``'Sponsorship Only'``.
        Values that are already numeric are passed through ``float``.

    Returns
    -------
    float
        The dollar amount. Sponsorship-only entries (matched case-insensitively,
        including the 'Sponsorsphip' misspelling) are returned as ``0.0``.

    Raises
    ------
    ValueError
        If the text is neither a sponsorship marker nor a parseable number.
    """
    # A numeric cell has no '$' or ',' to strip; just normalise the type.
    if not isinstance(alloc, str):
        return float(alloc)

    text = alloc.strip()

    # Compare case-insensitively so capitalisation variants need not be
    # enumerated one by one; the misspelled variant is kept as a known value.
    if text.lower() in ('sponsorship only', 'sponsorsphip only'):
        return 0.0

    return float(text.replace('$', '').replace(',', ''))

# Turn every raw allocation cell into a number via clean_alloc.
initial = initial.assign(Allocation=initial['Allocation'].map(clean_alloc))

initial

Unnamed: 0,Organization,Type,Standing,Allocation
10,BARE Magazine,PUB,10,3000.0
11,Berkeley Fiction Review,PUB,34,1500.0
12,"Berkeley Political Review, The",PUB,16,7000.0
13,Berkeley Science Review,PUB,17,1000.0
14,Cal Literature & Arts Magazine,PUB,19,2400.0
...,...,...,...,...
427,Volunteer Income Tax Association Program,SISG,4,600.0
428,Wonderworks,SISG,7,600.0
429,You Mean More,SISG,6,1500.0
430,Youth Empowerment Program,SISG,4,1500.0


In [4]:
def org_matcher(initial_name, final_df):
    """Map an organization name from the initial sheet to its name in ``final_df``.

    The closest fuzzy match among ``final_df['Organization']`` is looked up.
    A match with score 100 is accepted automatically; anything else is printed
    and must be confirmed interactively.

    Parameters
    ----------
    initial_name : str
        Organization name as written in the initial-allocation data.
    final_df : pandas.DataFrame
        Frame with an 'Organization' column holding the candidate names.

    Returns
    -------
    str or float
        The matched name, or ``np.nan`` when there is no candidate or the
        user declines the suggested match.
    """
    best_match = process.extractOne(initial_name, final_df['Organization'])

    # extractOne yields None when there are no candidates to compare against;
    # report that as "no match" instead of failing on the subscript below.
    if best_match is None:
        return np.nan

    # Index instead of unpacking: only the first two elements (name, score)
    # are needed, and the result may carry further elements.
    best_match_name = best_match[0]
    best_match_score = best_match[1]

    if best_match_score == 100:
        return best_match_name

    print(f'The initial name is: {initial_name}. \n The closest match is: {best_match_name}.')

    # Normalise the reply so that 'Y', 'y ' or 'yes' are not silently
    # treated as a rejection (which would turn the row into NaN).
    auth = input('authorize match?').strip().lower()
    if auth in ('y', 'yes'):
        return best_match_name

    return np.nan

In [5]:
# Resolve each initial-sheet name against the final table (may prompt for confirmation).
initial['matched_names'] = initial['Organization'].apply(org_matcher, args=(final,))

The initial name is: Berkeley Political Review, The. 
 The closest match is: The Berkeley Political Review.
authorize match?y
The initial name is: Heuristic Squelch, The. 
 The closest match is: The Heuristic Squelch.
authorize match?y
The initial name is: Smart Ass, The. 
 The closest match is: The Smart Ass.
authorize match?y
The initial name is: The Public Health Advocate. 
 The closest match is: Public Health Advocate.
authorize match?y
The initial name is: The Undergraduate Journal of Psychology at Berkeley. 
 The closest match is: Undergraduate Journal of Psychology at Berkeley.
authorize match?y
The initial name is: threads (formerly known as Al-Bayan). 
 The closest match is: Threads.
authorize match?y
The initial name is: A.S.T.R.O - Astronomer's Society for Teaching, Recreation and Outreach. 
 The closest match is: A.S.T.R.O. - Astronomer's Society for Teaching, Recreation and Outreach.
authorize match?y
The initial name is: Alpha Kappa Alpha Sorority, Incorporated. 
 The clo

authorize match?y
The initial name is: UC Berkeley Chi Epsilon. 
 The closest match is: Undergraduate Journal of Psychology at Berkeley.
authorize match?n
The initial name is: ULAB: Undergraduate Laboratory at Berkeley. 
 The closest match is: Undergraduate Laboratory at Berkeley.
authorize match?y
The initial name is: Women in Mathematics at Berkeley. 
 The closest match is: Womxn in Mathematics at Berkeley.
authorize match?y
The initial name is: 100 STRONG. 
 The closest match is: 100 Strong: Female Mentorship and Leadership Development.
authorize match?y
The initial name is: Alpha Phi Alpha Fraternity, Inc.. 
 The closest match is: Alpha Kappa Psi.
authorize match?n
The initial name is: ASUC Renters' Legal Assistance. 
 The closest match is: Renters' Legal Assistance.
authorize match?y
The initial name is: Berkeley Cambodian Student Association. 
 The closest match is: Berkeley Cambodian Students Association.
authorize match?y
The initial name is: Berkeley Disaster Team [Formerly Be

In [7]:
# Rows for which no final-table name was accepted.
unmatched_mask = initial['matched_names'].isna()
initial.loc[unmatched_mask]

Unnamed: 0,Organization,Type,Standing,Allocation,matched_names
40,"Alpha Kappa Alpha Sorority, Incorporated",SAG,1,400.0,
59,Autonomous Underwater Vehicles Team at Berkeley,SAG,1,0.0,
95,CalGreeks Programming Council,SAG,2,500.0,
106,Coalition to Defend Affirmative Action By Any Means Necessary,SAG,19,3500.0,
123,Delta Xi Phi Multicultural Sorority Inc.,SAG,1,500.0,
124,Democratic Education at Cal,SAG,36,4500.0,
144,FEM Tech @ Berkeley (Female Empowerment and Mentoring in Tech),SAG,1,400.0,
156,Gamma Rho Lambda,SAG,1,400.0,
157,Gates Millennium Scholars Association,SAG,11,950.0,
160,Golden Women,SAG,1,400.0,


In [67]:
# Disabled (the drop below is commented out): drop RSOs that are either part of the Panhellenic Council or are currently an ASUC-sponsored organization.
# initial = initial.drop([40,95,123,124,156,315,351,357,375,376,384,386,402,403,419])

In [69]:
# Hand-assigned matches, keyed by the row label in `initial`, for names the
# fuzzy matcher did not resolve.
manual_matches = {
    59: 'Underwater Robotics at Berkeley',
    106: 'BAMN - Coalition to Defend Affirmative Action, Integration, and Immigrant Rights and Fight for Equality By Any Means Necessary',
    144: 'FEMTech',
    157: 'Gates Millennium Student Association',
    160: 'The Golden',
    165: 'Somali, Ethiopian, Eritrean, South Sudanese, Sudanese Association (SEE§A)',
    192: 'Students of Color Emerging in English',
    268: 'Student Association for Applied Statistics',
    277: 'Speak Out Now',
    284: 'Berkeley Tennis Association',
    294: 'Chi Epsilon - Civil Engineering Honor Society',
    382: 'Mixed @ Berkeley Recruitment and Retention Center - MRRC',
    400: 'Raíces Recruitment and Retention Center',
}

for row_label, final_name in manual_matches.items():
    initial.at[row_label, 'matched_names'] = final_name

In [72]:
# Replace the original organization names with the matched names from the final table.
initial = initial.drop(columns='Organization')
initial = initial.rename(columns={'matched_names':'Organization'})

In [74]:
# Attach each organization's designation from the final table, then fix the column order.
designations = final[['Organization','Designation']]
initial = initial.merge(designations, on='Organization')
initial = initial[['Organization', 'Type', 'Designation', 'Standing', 'Allocation']]

In [88]:
# Persist the cleaned initial allocations so the comparison section can reload them.
initial.to_csv(path_or_buf='C:\\Users\\scher\\school\\senior-thesis\\data\\initial_allocs_2018.csv')

initial

Unnamed: 0,Organization,Type,Designation,Standing,Allocation
0,BARE Magazine,PUB,Publications RSO,10,3000.0
1,Berkeley Fiction Review,PUB,Publications RSO,34,1500.0
2,The Berkeley Political Review,PUB,Publications RSO,16,7000.0
3,Berkeley Science Review,PUB,Publications RSO,17,1000.0
4,Cal Literature & Arts Magazine,PUB,Publications RSO,19,2400.0
...,...,...,...,...,...
406,Volunteer Income Tax Assistance Program,SISG,Service RSO,4,600.0
407,Wonderworks,SISG,Service RSO,7,600.0
408,You Mean More,SISG,Health & Wellness RSO,6,1500.0
409,Youth Empowerment Program,SISG,Service RSO,4,1500.0


# Compare initial and final allocations

In [9]:
# Reload the saved initial allocations; the first CSV column is the row index.
initial = pd.read_csv(
    filepath_or_buffer='C:\\Users\\scher\\school\\senior-thesis\\data\\initial_allocs_2018.csv',
    index_col=0,
)

In [10]:
# Outer-join initial and final allocations on organization name so that
# organizations present in only one of the two tables are retained.
final_cols = final[['Organization','Standing','Allocation']]
df = initial.merge(final_cols, how='outer', on='Organization')

# Keep the final table's standing and give the allocation columns descriptive names.
df = df.drop(columns='Standing_x')
df = df.rename(columns={'Standing_y':'Standing', 'Allocation_x':'Initial Allocation', 'Allocation_y': 'Final Allocation'})

# Drop duplicates
# NOTE(review): these row labels are hard-coded and presumably depend on the
# row order produced by the merge above -- confirm after any data change.
df = df.drop(index=[177,178,180])

df

Unnamed: 0,Organization,Type,Designation,Initial Allocation,Standing,Final Allocation
0,BARE Magazine,PUB,Publications RSO,3000.0,11,3100.0
1,Berkeley Fiction Review,PUB,Publications RSO,1500.0,35,1500.0
2,The Berkeley Political Review,PUB,Publications RSO,7000.0,17,7000.0
3,Berkeley Science Review,PUB,Publications RSO,1000.0,18,1000.0
4,Cal Literature & Arts Magazine,PUB,Publications RSO,2400.0,27,2400.0
...,...,...,...,...,...,...
420,Songwriting at Berkeley,,,,2,100.0
421,Speech at Berkeley,,,,6,150.0
422,SPIRE,,,,4,150.0
423,REACT! at Berkeley,,,,3,400.0


In [11]:
# Series.sum() skips NaN by default, so organizations that appear in only one
# of the two tables contribute nothing to the total for the other.
total_initial = df['Initial Allocation'].sum()
total_final = df['Final Allocation'].sum()

print(f'The total initial allocation for RSOs was ${total_initial}, and ${total_final} was the final total.')
print(f'There was a ${total_final - total_initial} difference between the two.')

The total initial allocation for RSOs was $776186.0, and $792676.0 was the final total.
There was a $16490.0 difference between the two.


In [12]:
# Keep only rows with no missing values. The explicit .copy() makes
# compare_df an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
compare_df = df.dropna().copy()
compare_df['Allocation Diff'] = compare_df['Final Allocation'] - compare_df['Initial Allocation']

compare_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Organization,Type,Designation,Initial Allocation,Standing,Final Allocation,Allocation Diff
0,BARE Magazine,PUB,Publications RSO,3000.0,11,3100.0,100.0
1,Berkeley Fiction Review,PUB,Publications RSO,1500.0,35,1500.0,0.0
2,The Berkeley Political Review,PUB,Publications RSO,7000.0,17,7000.0,0.0
3,Berkeley Science Review,PUB,Publications RSO,1000.0,18,1000.0,0.0
4,Cal Literature & Arts Magazine,PUB,Publications RSO,2400.0,27,2400.0,0.0
...,...,...,...,...,...,...,...
405,Volunteer Income Tax Assistance Program,SISG,Service RSO,600.0,5,600.0,0.0
406,Wonderworks,SISG,Service RSO,600.0,8,600.0,0.0
407,You Mean More,SISG,Health & Wellness RSO,1500.0,7,1500.0,0.0
408,Youth Empowerment Program,SISG,Service RSO,1500.0,5,1500.0,0.0


In [13]:
# Organizations whose allocation changed between the initial and final rounds.
allocation_changed = compare_df['Allocation Diff'].ne(0)
diff_allocs = compare_df.loc[allocation_changed]
diff_allocs

Unnamed: 0,Organization,Type,Designation,Initial Allocation,Standing,Final Allocation,Allocation Diff
0,BARE Magazine,PUB,Publications RSO,3000.0,11,3100.0,100.0
10,Perspective Magazine,PUB,Publications RSO,3500.0,21,3900.0,400.0
17,Public Health Advocate,PUB,Publications RSO,1100.0,14,1200.0,100.0
39,Arab Student Union,SAG,Cultural & Identity/International RSO,900.0,7,1300.0,400.0
64,Black Recruitment and Retention Center,SAG,Sponsored RSO,45000.0,36,40500.0,-4500.0
65,Black Student Union,SAG,Cultural & Identity/Cultural & Ethnic RSO,500.0,9,15500.0,15000.0
80,Cal Queer & Asian,SAG,Cultural & Identity/LGBTQ+ RSO,1100.0,9,1995.0,895.0
81,Cal Taiko,SAG,Performing Arts RSO,2000.0,14,2900.0,900.0
84,CalSlam,SAG,Performing Arts RSO,600.0,14,800.0,200.0
104,Danceworx,SAG,Performing Arts RSO,3000.0,22,3400.0,400.0


In [16]:
# Average the numeric columns per designation. The columns are selected
# before aggregating because, from pandas 2.0 on, GroupBy.mean() no longer
# silently drops non-numeric columns (Organization, Type) and raises instead.
value_cols = ['Standing','Initial Allocation','Final Allocation','Allocation Diff']
by_designation = diff_allocs.groupby('Designation')

grouped_allocs = by_designation[value_cols].mean()

# Number of organizations in each designation; aligned on the Designation index.
grouped_allocs['Number of Orgs'] = by_designation.size()
grouped_allocs

Unnamed: 0_level_0,Standing,Initial Allocation,Final Allocation,Allocation Diff,Number of Orgs
Designation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Academic RSO,14.0,2881.25,3037.5,156.25,4
Cultural & Identity/Cultural & Ethnic RSO,9.0,500.0,15500.0,15000.0,1
Cultural & Identity/International RSO,7.0,900.0,1300.0,400.0,1
Cultural & Identity/LGBTQ+ RSO,29.5,9550.0,13497.5,3947.5,2
Health & Wellness RSO,4.0,500.0,600.0,100.0,1
Performing Arts RSO,14.5,1675.0,2075.0,400.0,4
Professional RSO,26.0,2500.0,3000.0,500.0,1
Publications RSO,15.333333,2533.333333,2733.333333,200.0,3
Service RSO,17.571429,2296.428571,2603.571429,307.142857,7
Sponsored RSO,25.875,19887.5,17898.75,-1988.75,8
