# My functions
This notebook contains frequently used helper functions.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 18})
import pandas as pd
import difflib 
#import country_converter as coco
import statsmodels.api as sm
from scipy import stats



In [2]:
def clean_data(df, cols):
    """
    Keep only the rows of df whose values in cols are strictly positive.

    Values in `cols` that are not greater than zero (NaN included, since
    NaN > 0 is False) are replaced with NaN, and then every row containing
    a NaN in *any* column is dropped. The work is done on a copy, so the
    frame passed in by the caller is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to clean.
    cols : column label or list of column labels
        Column(s) whose values must be strictly positive.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows that failed the check.
    """
    cleaned = df.copy()  # do not write the NaN mask back into the caller's frame
    selected = cleaned[cols]
    cleaned[cols] = selected.where(selected > 0)  # non-positive / missing -> NaN

    # dropna() looks at every column, not only `cols`.
    return cleaned.dropna()


In [3]:
def fit_scaling(xdata, ydata, plot = 1):
    """
    Fit the power law y = exp(c) * x**beta to the data.

    The fit is a degree-1 np.polyfit through (log x, log y), so beta is the
    slope and c the intercept in natural-log space.

    Parameters
    ----------
    xdata, ydata : array-like
        Values to fit; the natural log of both is taken.
    plot : int
        When equal to 1, the fitted curve is drawn on the current
        matplotlib axes over 50 log-spaced points spanning xdata.

    Returns
    -------
    (beta, c)
        Slope and intercept of the log-log fit.
    """
    log_x = np.log(xdata)
    log_y = np.log(ydata)
    beta, c = np.polyfit(log_x, log_y, 1)

    if plot == 1:
        low_exp = np.log10(np.min(xdata))
        high_exp = np.log10(np.max(xdata))
        grid = np.logspace(low_exp, high_exp, 50)
        plt.plot(grid, grid**beta*np.exp(c), "-")

    return beta, c
    
def plotScaling(df, xvar, yvar, labelVar, annotateList = None, plotLine = 1, dotmarker = "o", delta = 1, annotateFont = 12, betaTextAdjust = 0.8): 
    """
    Plot yvar against xvar on log-log axes and fit a scaling exponent.

    Only rows where both xvar and yvar are strictly positive are used.
    Missing values fail that comparison and are dropped as well. The same
    rows are used for the scatter, the fit, the annotations and the
    placement of the exponent text, so nothing is drawn for rows that are
    not part of the plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    xvar, yvar : column labels
        Columns plotted on the x and y axes.
    labelVar : str or None
        Column holding the annotation text. None or "" disables annotation.
    annotateList : container, optional
        Label values to annotate. None (the default) means no annotations.
    plotLine : int
        Forwarded to fit_scaling as `plot`; 1 draws the fitted line.
    dotmarker : str
        Matplotlib format string for the data points.
    delta : number
        Additive offset applied to both coordinates of each annotation.
    annotateFont : number
        Font size of the annotations.
    betaTextAdjust : number
        Factor applied to the largest plotted y value to position the
        "Exponent = ..." text.

    Returns
    -------
    (beta, c)
        Slope and intercept of the log-log fit returned by fit_scaling.
    """
    # NaN > 0 evaluates to False, so this one mask removes missing values
    # and the zero/negative values that np.log cannot handle.
    usable = (df[xvar] > 0) & (df[yvar] > 0)
    plotted = df[usable]
    xdata = plotted[xvar]
    ydata = plotted[yvar]

    plt.loglog(xdata, ydata, dotmarker) # plot data

    beta, c = fit_scaling(xdata, ydata, plot = plotLine) # get lin fit.

    has_label_column = labelVar is not None and len(labelVar) > 0
    has_targets = annotateList is not None and len(annotateList) > 0
    if has_label_column and has_targets:
        # Walk only the plotted rows so every annotation has finite coordinates.
        for label, xpos, ypos in zip(plotted[labelVar], xdata, ydata):
            if label in annotateList:
                plt.annotate(label, (xpos + delta, ypos + delta), fontsize = annotateFont)

    # Anchor the text to the plotted data so it always sits at a valid
    # position on the log axes.
    plt.text(xdata.min(), ydata.max()*betaTextAdjust, "Exponent = {:.2f}".format(beta) )
    return beta, c


In [4]:
    
def plotLinear(df, xvar, yvar, labelVar, annotateList = None, plotLine = 1, dotmarker = "o", delta = 1, annotateFont = 12, betaTextAdjust = 0.8): 
    """
    Plot yvar against xvar on linear axes and fit a straight line.

    Rows with a missing value in xvar or yvar are dropped first. The
    remaining rows are used for the scatter, the degree-1 np.polyfit and
    the annotations, so labels are never placed at NaN coordinates. The
    fitted equation is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    xvar, yvar : column labels
        Columns plotted on the x and y axes.
    labelVar : str or None
        Column holding the annotation text. None or "" disables annotation.
    annotateList : container, optional
        Label values to annotate. None (the default) means no annotations.
    plotLine : int
        When equal to 1, the fitted line is drawn over 50 evenly spaced
        points spanning the plotted x values.
    dotmarker : str
        Matplotlib format string for the data points.
    delta : number
        Additive offset applied to both coordinates of each annotation.
    annotateFont : number
        Font size of the annotations.
    betaTextAdjust : number
        Not used by this function; kept so the signature matches
        plotScaling.

    Returns
    -------
    (beta, c)
        Slope and intercept of the fitted line.
    """
    # Keep every column (labelVar included) but only rows with both values present.
    plotted = df.dropna(subset = [xvar, yvar])
    xdata = plotted[xvar]
    ydata = plotted[yvar]

    plt.plot(xdata, ydata, dotmarker) # plot data
    beta, c = np.polyfit(xdata, ydata, 1)

    has_label_column = labelVar is not None and len(labelVar) > 0
    has_targets = annotateList is not None and len(annotateList) > 0
    if has_label_column and has_targets:
        for label, xpos, ypos in zip(plotted[labelVar], xdata, ydata):
            if label in annotateList:
                plt.annotate(label, (xpos + delta, ypos + delta), fontsize = annotateFont)

    if plotLine == 1:
        x = np.linspace(np.min(xdata), np.max(xdata), 50)
        plt.plot(x, x*beta +c, "-")

    print("best fit equations: y = {:.2f} x + {:.2f}".format(beta, c) )

    return beta, c

In [5]:
# fuzzy match cities
def fuzzy_match(x, dataCol):
    """
    Find the closest fuzzy match of x among the strings in dataCol.

    Matching is done with difflib.get_close_matches using its default
    settings; the best-scoring candidate is returned.

    Parameters
    ----------
    x : str
        Value to look up. Anything that is not a string (NaN, None,
        numbers) cannot be matched and yields NaN.
    dataCol : iterable
        Candidate values. Entries that are not strings, such as missing
        values, are ignored.

    Returns
    -------
    str or float
        The best match, or np.nan when there is no sufficiently close
        candidate.
    """
    # difflib needs sequences on both sides; a float NaN from a column with
    # missing values would raise TypeError instead of giving "no match".
    if not isinstance(x, str):
        return np.nan

    candidates = [item for item in dataCol if isinstance(item, str)]
    matchList = difflib.get_close_matches(x, candidates)
    return matchList[0] if matchList else np.nan



In [6]:
# for residual analysis
def compute_log_residual(df, xvar, yvar, beta, c): 
    """
    Residual of log(yvar) around the line beta * log(xvar) + c.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    xvar, yvar : column labels
        Columns whose natural logs are the predictor and the response.
    beta, c : number
        Slope and intercept of the log-log line.

    Returns
    -------
    Observed log(yvar) minus the predicted value, one entry per row of df.
    """
    predicted = np.log(df[xvar])*beta + c
    observed = np.log(df[yvar])
    return observed - predicted

In [7]:
# for OLS analysis

def plot_descriptives(df, xval, yval, zval, annotateVal, cityList, xlabel, ylabel, zlabel, dotmarker = "o"):
    """
    Draw two side-by-side scaling plots in a new 11 x 5 figure.

    Left panel:  yval against xval.
    Right panel: zval (connectivity variable) against xval.

    Each panel is produced by plotScaling, with annotateVal as the label
    column and cityList as the labels to annotate. Nothing is returned.
    """
    plt.figure(figsize = (11, 5))

    # (subplot position, column on the y axis, y-axis label)
    panels = (
        (121, yval, ylabel),
        (122, zval, zlabel),
    )
    for position, target, target_label in panels:
        plt.subplot(position)
        plotScaling(df, xval, target, annotateVal, cityList, dotmarker = dotmarker)
        plt.xlabel(xlabel)
        plt.ylabel(target_label)

    plt.subplots_adjust(wspace =0.3)
    
    
def store_results(results):
    """
    Collect the headline numbers of a fitted regression into a dict.

    Reads params, aic, bic, conf_int(alpha = 0.05), rsquared, rsquared_adj
    and bse from `results`. The standard errors are stored under the key
    "standar_err" (spelled exactly like that).
    """
    out = {}
    out["params"] = results.params
    out["aic"] = results.aic
    out["bic"] = results.bic
    out["CI95"] = results.conf_int(alpha = 0.05)
    out["r_sq"] = results.rsquared
    out["r_sq_adj"] = results.rsquared_adj
    out["standar_err"] = results.bse
    return out 

def run_null_model(df, xvar, yvar):
    """
    Run the null regression model:
    log y = beta log x + c

    The model is fitted with sm.OLS on the natural logs of df[yvar] and
    df[xvar], with a constant added by sm.add_constant. Returns the dict
    built by store_results.
    """
    log_response = np.log(df[yvar])
    design = sm.add_constant(np.log(df[xvar]))

    fitted = sm.OLS(log_response, design).fit()

    return store_results(fitted)

def run_gnc_model(df, x1var, x2var, yvar):
    """
    Run the GNC model:
    log y = beta log x1 + alpha log x2 + c

    The model is fitted with sm.OLS on the natural logs of df[yvar] and of
    the two predictor columns, with a constant added by sm.add_constant.
    Returns the dict built by store_results.
    """
    log_response = np.log(df[yvar])
    predictors = [x1var, x2var]
    design = sm.add_constant(np.log(df[predictors]))

    fitted = sm.OLS(log_response, design).fit()

    return store_results(fitted)

In [8]:
def descriptive_scaling(df, total_col, col , xvar, gdpVar, gncVar, annotateList = None, dotmarker = ".", ylable1= 'GDP', labelVar = "city_gnc"):
    """
    Draw one column of a 2-row grid of scaling plots on the current figure.

    Top row:    gdpVar against xvar.
    Bottom row: gncVar against xvar.

    No figure is created here; the caller is expected to set one up and
    call this once per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    total_col : int
        Number of columns in the subplot grid.
    col : int
        1-based column to draw into.
    xvar, gdpVar, gncVar : column labels
        x-axis column and the y-axis columns of the top and bottom panels.
    annotateList : container, optional
        Label values to annotate. None (the default) means no annotations.
    dotmarker : str
        Matplotlib format string for the data points.
    ylable1 : str
        y-axis label of the top panel.
    labelVar : str
        Column holding the annotation text, passed through to plotScaling.
    """
    # Always hand plotScaling a real container, never None.
    labels = [] if annotateList is None else annotateList

    plt.subplot(2, total_col, col )
    plotScaling(df, xvar, gdpVar, labelVar, annotateList = labels, dotmarker = dotmarker)
    plt.xlabel("Population")
    plt.ylabel(ylable1)

    # Subplot indices run row by row, so the panel directly below `col` is
    # total_col + col. The previous fixed offset of 3 was only right for a
    # three-column grid.
    plt.subplot(2, total_col, total_col + col)
    plotScaling(df, xvar, gncVar, labelVar, annotateList = labels, dotmarker = dotmarker)
    plt.xlabel("Population")
    plt.ylabel("GNC")
    
