In [1]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import time
import os

In [2]:
from plotly.graph_objs import Bar, Layout
from plotly import offline
import plotly.express as px
import plotly.graph_objects as go

In [3]:
colleges = pd.read_csv('/Users/kevins/Documents/CSImpact/data/colleges.csv')

In [4]:
data = pd.read_csv('/Users/kevins/Documents/CSImpact/data/cleaned_prof_data_v4.csv')

In [15]:
college_data = pd.DataFrame()

def plot(college):
    #remove spaces from college names
    college_no_spaces = college.replace(' ','_').replace('/', '_')
    
    #Get the index of the college
    i = colleges.index[colleges['college'] == f'{college}'].tolist()[0]
    
    #get corresponding csv file
    college_data = pd.read_csv('/Users/kevins/Documents/CSImpact/data/college_data/college'+str(i)+'.csv')
    
    #get rows with missing data
    missing_id_rows = college_data.loc[(pd.isna(college_data['h-index'])) & (pd.isna(college_data['citations']))]
    missing_profs = missing_id_rows['name']
    #get rows with good data
    rows_to_plot = college_data.loc[(~(pd.isna(college_data['h-index'])) & ~(pd.isna(college_data['citations'])))]
    
    #number of professors/rows
    rows = college_data.shape[0]
    
    
    
    #print(missing_profs)
    
    # turn missing data into y value of 1 to put at the bottom of graph 
    missing_h_indexes = []
    missing_citations = []
    
    for p in missing_profs:
        missing_h_indexes.append(1)
        missing_citations.append(1)
    #print(missing_h_indexes)
    
    #set row width based on amount of data
    if(rows>5):
        fig_width = rows * 20 + 100
    else:
        fig_width= 400
        
    #sort data smallest to largest
    rows_to_plot = rows_to_plot.sort_values('h-index')
    
    #get stats of data (mean, etc.)
    desc = rows_to_plot.describe()
    
    
    
    
    #PLOTTING H-INDEX
    
    
    
    
    #get mean from description
    h_indx_mean = round(float(desc.iloc[1,1]),1)
    #print(h_indx_mean)
    
    #get good data, names
    h_indexes = rows_to_plot['h-index']
    names = rows_to_plot['name']
    
#     print(names)
#     print(h_indexes)
    
    #plot bad data first at the bottom
    fig = px.scatter(x = missing_profs, y = missing_h_indexes, labels = {'x':'name', 'y':'h-index'},title = f'{college}', width = fig_width, height = 1000, range_y = [1,320], range_x = [-1,rows],color_discrete_sequence=['red'], log_y = True)
    fig.update_layout(
    yaxis = dict(
        #log scale
        tickmode = 'array',
        tickvals = [0,5,10, 20,40,80,160, 320]
    )
)
    #add labels to bad data saying 'NO DATA' when you hover over the  points
    fig.update_traces(hovertemplate='NO DATA')
    
    # the annotation for the average line disappears for some reason so get the x-value to add it manually
    if(pd.isna(missing_id_rows.iloc[0,0])):
        annotation_x = rows_to_plot.iloc[0,0]
    else:
        annotation_x = missing_id_rows.iloc[0,0]
    
    # plot average line
    if(not pd.isna(h_indx_mean)):
        fig.add_hline(y=h_indx_mean, line_color = 'black',annotation_text=f'Average = {h_indx_mean}', annotation_position = "bottom")
        fig.add_trace(go.Scatter(
            x=[missing_id_rows.iloc[0,0]],
            y=[h_indx_mean],
            mode="lines+text",
            text=[f'Average = {h_indx_mean}'],
            textposition="top right"
        ))
    else:
        h_indx_mean = 0
    
    # add subtitle displaying average
    fig.update_layout(title = go.layout.Title(text = f'{college} <br><sup>Average h-index for {rows_to_plot.shape[0]} professors: {h_indx_mean}</sup>', xref = 'paper', x = 0), font=dict(
               family="Courier New, monospace",
               size=10,
               color="#000000"
           )
        )
    
    # plot good data
    fig.add_trace(
    go.Scatter(
        x=names,
        y= h_indexes,
        mode ='markers', name = '',marker = dict( color = 'black'))
    )
    
    # hide the legend
    fig.update_layout(showlegend=False)
    
    #fig.show()
    fig.write_html(f'/Users/kevins/Documents/CSImpact/data/college_graphs/{college_no_spaces}/{college_no_spaces}_h_index.html')

    
    # PLOTTING CITATIONS
    #basically the same logic as plotting h-index
    
    cit_mean = round(float(desc.iloc[1,0]),1)
    #print(cit_mean)
    
    rows_to_plot = rows_to_plot.sort_values('citations')
    names = rows_to_plot['name']
    
    citations = rows_to_plot['citations']

        
    fig = px.scatter(x = missing_profs, y = missing_citations, labels = {'x':'name', 'y':'citations'},title = f'{college}', width = fig_width, height = 1000, range_y = [1,325000], range_x = [-1,rows],color_discrete_sequence=['red'],log_y = True)
    fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [0,33,325, 3250,32500,325000]
    )
)
    
    fig.update_traces(hovertemplate='NO DATA')

    
    if(pd.isna(missing_id_rows.iloc[0,0])):
        annotation_x = rows_to_plot.iloc[0,0]
    else:
        annotation_x = missing_id_rows.iloc[0,0]
        
    #print(not pd.isna(cit_mean))
    if(not pd.isna(cit_mean)):
        fig.add_hline(y=cit_mean, line_color = 'black',annotation_text=f'Average = {cit_mean} citations', annotation_position = "top left")
        fig.add_trace(go.Scatter(
            x=[annotation_x],
            y=[cit_mean],
            mode="lines+text",
            text=[f'Average = {cit_mean}'],
            textposition="top right"
        ))
    else:
        cit_mean = 0
    fig.update_layout(title = go.layout.Title(text = f'{college} <br><sup>Average citations for {rows_to_plot.shape[0]} professors: {cit_mean}</sup>', xref = 'paper', x = 0), font=dict(
               family="Courier New, monospace",
               size=10,
               color="#000000"
           )
        )

    
    fig.add_trace(
    go.Scatter(
        x=names,
        y= citations,
        mode ='markers', name = '',marker = dict( color = 'black'))
    )
    fig.update_layout(showlegend=False)
    
    #fig.show()
    fig.write_html(f'/Users/kevins/Documents/CSImpact/data/college_graphs/{college_no_spaces}/{college_no_spaces}_citations.html')

In [14]:
plot('Carnegie Mellon University')

In [7]:
plot(f'{colleges.iloc[470,0]}')

In [16]:
#plot all colleges
for i in range(colleges.shape[0]):
    plot(f'{colleges.iloc[i,0]}')
    print(f'finished {i}')

finished AUEB


finished Aalborg University


finished Aalto University


finished Aarhus University


finished Aberystwyth University


finished Air Force Institute of Technology


finished American University in Cairo


finished American University of Beirut


finished Ariel University


finished Arizona State University


finished Auburn University


finished Augusta University


finished Australian National University


finished BITS Pilani


finished BITS Pilani-Goa


finished BUET


finished BUPT


ValueError: Cannot accept list of column references or list of columns for both `x` and `y`.

In [119]:
college_data = pd.read_csv(f'/Users/kevins/Documents/CSImpact/data/college_data/college46.csv')

In [120]:
college_data.describe()

Unnamed: 0,citations,h-index
count,163.0,163.0
mean,17525.527607,48.441718
std,25822.972547,28.289312
min,457.0,9.0
25%,4793.0,27.0
50%,11942.0,44.0
75%,19000.0,63.5
max,237097.0,179.0


In [55]:
colleges.head(50)

Unnamed: 0,college
0,AUEB
1,Aalborg University
2,Aalto University
3,Aarhus University
4,Aberystwyth University
5,Air Force Institute of Technology
6,American University in Cairo
7,American University of Beirut
8,Ariel University
9,Arizona State University


In [17]:
#create folders
for i in range(colleges.shape[0]):
    college_no_spaces = pd.read_csv(f'/Users/kevins/Documents/CSImpact/data/colleges.csv').iloc[i,0].replace(' ','_').replace('/', '_')
    os.mkdir(f'/Users/kevins/Documents/CSImpact/data/college_graphs/{college_no_spaces}')

In [107]:
colleges.loc[colleges['college']=='UNSW']

Unnamed: 0,college
340,UNSW
