In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import csv
from collections import Counter 
from collections import *
import math
import json
%load_ext jupyternotify

<IPython.core.display.Javascript object>

# Import Data from Google Sheets

In [120]:
from __future__ import print_function
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

In [121]:
# OAuth scope handed to InstalledAppFlow in read_google_sheets; the URL names
# the read-only spreadsheets scope.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

# Reference: https://developers.google.com/sheets/api/quickstart/python
def read_google_sheets(SPREADSHEET_ID, RANGE_NAME, HEADER_RANGE):
    """Fetch a data range and a header range from one Google spreadsheet.

    Credentials are loaded from ``token.pickle`` in the working directory when
    that file exists.  If they are missing or invalid they are either refreshed
    (expired credentials carrying a refresh token) or obtained through the
    local-server OAuth flow configured by ``credentials.json`` and the
    module-level ``SCOPES``; the resulting credentials are then written back
    to ``token.pickle``.

    Parameters
    ----------
    SPREADSHEET_ID : str
        Identifier passed as ``spreadsheetId`` to the Sheets API.
    RANGE_NAME : str
        Range requested for the data rows.
    HEADER_RANGE : str
        Range requested for the header row.

    Returns
    -------
    tuple
        ``(values, header_values)`` -- the ``'values'`` entry of each API
        response, or an empty list when a response has no such entry.
    """
    token_path = 'token.pickle'

    credentials = None
    if os.path.exists(token_path):
        # NOTE(review): pickle.load can execute arbitrary code; only keep a
        # token file in this directory that this notebook wrote itself.
        with open(token_path, 'rb') as token_file:
            credentials = pickle.load(token_file)

    if not credentials or not credentials.valid:
        refreshable = credentials and credentials.expired and credentials.refresh_token
        if refreshable:
            credentials.refresh(Request())
        else:
            oauth_flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            credentials = oauth_flow.run_local_server(port=0)
        # Persist whatever we ended up with so the next run can skip the login.
        with open(token_path, 'wb') as token_file:
            pickle.dump(credentials, token_file)

    spreadsheets = build('sheets', 'v4', credentials=credentials).spreadsheets()

    def fetch_rows(cell_range):
        """Return the 'values' entry of one range request ([] when absent)."""
        response = spreadsheets.values().get(spreadsheetId=SPREADSHEET_ID,
                                             range=cell_range).execute()
        return response.get('values', [])

    # Same request order as before: data range first, header range second.
    values = fetch_rows(RANGE_NAME)
    header_values = fetch_rows(HEADER_RANGE)

    return values, header_values

In [122]:
SPREADSHEET_ID = '1AjynK9mMQTw58B_B8b_ZIip3fyUm-aoV7Pp21HziBb0'
RANGE_NAME = 'canto_codings!A2:AT'
HEADER_RANGE = 'canto_codings!A1:AT1'

data, header = read_google_sheets(SPREADSHEET_ID, RANGE_NAME, HEADER_RANGE)

# The header range comes back as a list of rows; without at least one row
# there are no column names to build the frame from.
if not header:
    raise ValueError("header range %r returned no rows" % HEADER_RANGE)

df = pd.DataFrame(data, columns=header[0])

# Drop the first data row when it is the 'song_id' placeholder row (assumed
# not to be finalized data).  The check stays best-effort: a missing column,
# an empty frame (KeyError) or an ambiguous comparison (ValueError) is
# reported and skipped instead of being hidden behind a bare `except`.
try:
    if df.loc[0, 'canto_coding_id'] == 'song_id':
        df = df.drop([0])
except (KeyError, ValueError) as exc:
    print("placeholder-row check skipped:", repr(exc))

df.to_csv('./data/downloaded_data.csv')

no error


## Load Metadata

In [141]:
# Metadata table used by later cells: get_metadata reads its 'society_id',
# 'Local_latitude' and 'Local_longitude' columns; convert_matrix reads 'C-id',
# 'Year', 'Publisher', 'Publcation_collection', 'Repository' and 'Sources'.
metadata = pd.read_csv('./data/metadata.csv')

## Convert Data

In [320]:
# Load conversion guide: indexed by feature column name in convert(), each
# entry being a list of items with "code" and "display_code" keys.
with open('./data/conversion_guide.json') as f:
    conversion_guide = json.load(f)

# Work from the CSV snapshot written by the download cell.
df = pd.read_csv('./data/downloaded_data.csv')

# Columns left out of the frame that is fed to convert(); 'Unnamed: 0' is
# presumably the index column the download cell wrote into the CSV.
dropped_columns = [
    'orv_1',
    'orv_2',
    'ensemble_value_id',
    'ensemble_value_label',
    'instrument_value_id',
    'instrument_value_label',
    'Unnamed: 0',
]
df2 = df.drop(columns=dropped_columns)

# Names of the 37 feature columns: line_1 .. line_37.
feature_cols = ["line_" + str(n) for n in range(1, 38)]

In [297]:
def get_metadata(soc_id):
    """Return the ``(Local_latitude, Local_longitude)`` pair of a society.

    Scans the module-level ``metadata`` frame from top to bottom and returns
    the coordinates of the first row whose ``society_id`` equals ``soc_id``
    after both are converted with ``int``.  Returns ``None`` when no row
    matches.
    """
    coordinates = (
        (record["Local_latitude"], record["Local_longitude"])
        for _, record in metadata.iterrows()
        if int(record["society_id"]) == int(soc_id)
    )
    # Lazy generator: stops at the first match, exactly like an early return.
    return next(coordinates, None)

In [321]:
def convert(input_matrix, i):
    """Replace the raw feature codes of row ``i`` with their display codes.

    For every column of ``input_matrix`` that is listed in the module-level
    ``feature_cols``, the value at index label ``i`` is compared (as ``int``)
    with the ``"code"`` of each item in ``conversion_guide[col]``.  On a match
    the item's ``"display_code"`` is written into the module-level ``df`` at
    ``df.loc[i, col]``.  When several items match, the last one wins.

    Parameters
    ----------
    input_matrix : pandas.DataFrame
        Frame containing the row with index label ``i``.
    i : int
        Index label of the row to convert, used for both ``input_matrix``
        and ``df``.

    Returns
    -------
    None
        The result is the in-place update of the module-level ``df``.

    Raises
    ------
    ValueError, TypeError
        If a feature value or a guide ``"code"`` cannot be converted by
        ``int`` (for example a missing value).
    """
    # DataFrame.items() yields (column name, column Series) pairs.
    for col, column_values in input_matrix.items():
        if col not in feature_cols:
            continue
        # Convert once per column instead of once per guide item.
        raw_code = int(column_values.loc[i])
        for item in conversion_guide[col]:
            if int(item["code"]) == raw_code:
                df.loc[i, col] = item["display_code"]

In [323]:
%%notify -m "conversion completed"
# this will take about an hour. Will have to refactor later to optimize
# convert() is handed one single-row slice of df2 per call; i is used here as
# the positional slice bound and inside convert() as the row label.
for i in range(len(df2)):
    convert(df2[i:(i+1)],i)

<IPython.core.display.Javascript object>

In [324]:
# to_csv() returns None when it is given a path, so chaining .drop() after it
# raised AttributeError and never removed the column.  Drop 'Unnamed: 0'
# first, then write.  The frame's index is still written as before; pass
# index=False if that column is not wanted in the file either.
df.drop(columns=['Unnamed: 0']).to_csv("./data/full_cantometrics.csv")

# Prepare Data

In [40]:
# Column layout of the long-form table that convert_matrix produces.
columns = [
    'soc_id', 'canto_coding_id', 'var_id', 'code',
    'year', 'publisher', 'publication_collection', 'repository', 'sources',
]

# Empty accumulator with that layout; the chunk cells below add rows to it.
long_form = pd.DataFrame(columns=columns)

# Split it up into smaller chunks

In [41]:
# Positional row slices of df: four chunks of 1000 rows and a final chunk
# holding whatever remains after row 4000.
df_1 = df.iloc[0:1000]
df_2 = df.iloc[1000:2000]
df_3 = df.iloc[2000:3000]
df_4 = df.iloc[3000:4000]
df_5 = df.iloc[4000:]

# Matrix Conversion Function

In [42]:
def convert_matrix(input_matrix):
    """Reshape wide per-song codings into long form, one row per coded variable.

    For every row of ``input_matrix`` the publication metadata is looked up in
    the module-level ``metadata`` frame (rows whose ``'C-id'`` equals the
    row's ``canto_coding_id``; the first match is used) and one output record
    is produced for each of the 37 variables ``cv_1`` .. ``cv_37``.

    Changes from the previous implementation:

    * the ``sources`` value is always defined (it used to be initialised under
      the name ``source``, raising ``UnboundLocalError`` when the first song
      had no metadata match);
    * metadata fields are reset to ``None`` for every song, so a song without
      a metadata match no longer inherits the previous song's values;
    * records are collected in a list and the frame is built once, instead of
      calling ``DataFrame.append`` per record (quadratic, and removed in
      pandas 2.0).  As a consequence the result carries a 0..n-1 index
      instead of an index consisting only of zeros.

    Parameters
    ----------
    input_matrix : pandas.DataFrame
        Must provide the columns ``canto_coding_id``, ``C_cid`` and
        ``cv_1`` .. ``cv_37``.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame laid out as the module-level ``columns`` list,
        with 37 rows per input row.

    Raises
    ------
    ValueError, TypeError
        If ``canto_coding_id`` or a code cannot be converted by ``int``
        (for example a missing value).
    """
    # Metadata columns in the order the output record expects them:
    # year, publisher, publication_collection, repository, sources.
    # 'Publcation_collection' is spelled exactly as the metadata column is.
    metadata_fields = ('Year', 'Publisher', 'Publcation_collection',
                       'Repository', 'Sources')

    records = []
    for _, row in input_matrix.iterrows():
        canto_coding_id = row['canto_coding_id']
        soc_id = row['C_cid']

        metadata_row = metadata[metadata['C-id'] == canto_coding_id]
        if len(metadata_row) > 0:
            song_metadata = [metadata_row[field].values[0]
                             for field in metadata_fields]
        else:
            # No metadata for this song: leave every field empty.
            song_metadata = [None] * len(metadata_fields)

        song_id = int(canto_coding_id)
        for var_id in range(1, 38):
            code = row['cv_' + str(var_id)]
            records.append([soc_id, song_id, var_id, int(code)] + song_metadata)

    # dtype=object keeps the values as they are (no int -> float upcast caused
    # by missing metadata), matching what the CSV export looked like before.
    return pd.DataFrame(records, columns=columns, dtype=object)

# Execute each chunk one by one

In [43]:
%%notify -m "section 1 completed"
long_form = long_form.append(convert_matrix(df_1))

<IPython.core.display.Javascript object>

In [44]:
%%notify -m "section 2 completed"
long_form = long_form.append(convert_matrix(df_2))

<IPython.core.display.Javascript object>

In [45]:
%%notify -m "section 3 completed"
long_form = long_form.append(convert_matrix(df_3))

<IPython.core.display.Javascript object>

In [46]:
%%notify -m "section 4 completed"
long_form = long_form.append(convert_matrix(df_4))

<IPython.core.display.Javascript object>

In [47]:
%%notify -m "section 5 completed"
long_form = long_form.append(convert_matrix(df_5))
long_form.to_csv('./output/long_form.csv')

<IPython.core.display.Javascript object>