In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import csv
from collections import Counter 
from collections import *
import math
%load_ext jupyternotify

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


# Import Data from Google Sheets

In [27]:
from __future__ import print_function
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

In [28]:
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

# Reference: https://developers.google.com/sheets/api/quickstart/python
def read_google_sheets(SPREADSHEET_ID, RANGE_NAME, HEADER_RANGE):
    """Read a data range and a header range from one Google spreadsheet.

    Credentials are loaded from ``token.pickle`` when that file exists. If they
    are missing or invalid they are refreshed (when expired and a refresh token
    is available) or obtained through a local-server OAuth flow driven by
    ``credentials.json``; the resulting credentials are written back to
    ``token.pickle``.

    Parameters
    ----------
    SPREADSHEET_ID : str
        Id of the spreadsheet to read.
    RANGE_NAME : str
        Range holding the data rows.
    HEADER_RANGE : str
        Range holding the header row.

    Returns
    -------
    tuple
        ``(values, header_values)``: the ``'values'`` entry of each API
        response, or ``[]`` when the response has no such entry.
    """
    token_path = 'token.pickle'

    # Start from credentials cached by an earlier run, if there are any.
    # NOTE(review): pickle.load executes arbitrary code from the file - only
    # use a token.pickle that this notebook wrote itself.
    credentials = None
    if os.path.exists(token_path):
        with open(token_path, 'rb') as cached:
            credentials = pickle.load(cached)

    needs_login = not credentials or not credentials.valid
    if needs_login:
        can_refresh = credentials and credentials.expired and credentials.refresh_token
        if can_refresh:
            credentials.refresh(Request())
        else:
            credentials = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES).run_local_server(port=0)
        # Cache whatever was obtained so the next run can skip the login.
        with open(token_path, 'wb') as cached:
            pickle.dump(credentials, cached)

    spreadsheets = build('sheets', 'v4', credentials=credentials).spreadsheets()

    def fetch(cell_range):
        # One values.get request; a response without 'values' yields [].
        response = spreadsheets.values().get(spreadsheetId=SPREADSHEET_ID,
                                             range=cell_range).execute()
        return response.get('values', [])

    # Same request order as before: data rows first, then the header row.
    rows = fetch(RANGE_NAME)
    header_rows = fetch(HEADER_RANGE)

    return rows, header_rows

In [29]:
# Spreadsheet to read, plus the two ranges passed to read_google_sheets:
# RANGE_NAME starts at row 2 (data rows) and HEADER_RANGE is row 1 only; both
# span columns A..AT of 'canto_codings'.
SPREADSHEET_ID = '1AjynK9mMQTw58B_B8b_ZIip3fyUm-aoV7Pp21HziBb0'
RANGE_NAME = 'canto_codings!A2:AT'
HEADER_RANGE = 'canto_codings!A1:AT1'

# Returns (values of RANGE_NAME, values of HEADER_RANGE).
data, header = read_google_sheets(SPREADSHEET_ID, RANGE_NAME, HEADER_RANGE)

# header[0] (the first and only requested header row) supplies the column names.
# NOTE(review): the Sheets API presumably returns cell values as strings and
# omits trailing empty cells - confirm dtypes and row widths before treating
# any column as numeric.
df = pd.DataFrame(data, columns = header[0])

In [30]:
# Preview the first rows of the imported codings as a quick sanity check.
df.head()

Unnamed: 0,canto_coding_id,Culture,C_cid,cv_1,cv_2,cv_3,cv_4,cv_5,cv_6,cv_7,...,cv_34,cv_35,cv_36,cv_37,orv_1,orv_2,ensemble_value_id,ensemble_value_label,instrument_value_id,instrument_value_label
0,1,South Jakarta,17557,4,512,64,16,2,2,9216,...,16,16,128,16,1,6,1.0,All singers play instruments,6.0,5 kinds of instruments
1,2,Balinese,11507,4,2,2,16,2,2,2,...,2,2,128,16,1,0,,,,
2,3,Balinese,11507,512,8,64,128,16,128,8192,...,16,16,2,128,7,3,1.0,All singers play instruments,3.0,2 kinds of instruments
3,4,Balinese,11507,4096,4096,4096,128,128,8192,8192,...,16,2,1024,16,7,3,1.0,All singers play instruments,1.0,No instruments
4,5,Sama,62459,4,256,4,16,2,2,16,...,16,16,128,16,1,1,,,1.0,No instruments


# Load Metadata

In [31]:
# Metadata table; convert_matrix looks rows up by 'C-id' and reads the 'Year',
# 'Publisher', 'Publcation_collection', 'Repository' and 'Sources' columns.
metadata = pd.read_csv('./data/metadata.csv')

In [32]:
# Preview the first rows of the metadata table as a quick sanity check.
metadata.head()

Unnamed: 0,C-id,Local_lat,Local_long,Region,Division,Subregion,Area/Kingdom,Culture,People,Culture_loc,...,Living_metadata,Recorded_by,Year,Publisher,Publcation_collection,Repository,Classification_notes,Metadata_notes,Sources,Source Tag
0,458,-0.48,15.89,Africa,Central Africa,Equatorial Central Africa,"Cuvette Dept, C Congo",Kouyou,W Equatorial Bantu,"Fort-Rousset, Ouando, Republic of the Congo",...,,"Gilbert Rouget, André Didier, Musée de l'Homme...",1946,Smithsonian Folkways,Music of Equatorial Africa. Folkways Records F...,Ralph Rinzler Folklife Archives and Collection...,,,Introduction by Harold Courlander; notes by Gi...,@FW1950MusicOfEquatorialAfrica
1,459,-0.48,15.89,Africa,Central Africa,Equatorial Central Africa,"Cuvette Dept, C Congo",Kouyou,W Equatorial Bantu,"Fort-Rousset, Ouando, Republic of the Congo",...,,"Gilbert Rouget, André Didier, Musée de l'Homme...",1946,Smithsonian Folkways,Music of Equatorial Africa. Folkways Records F...,Ralph Rinzler Folklife Archives and Collection...,,,Introduction by Harold Courlander; notes by Gi...,@FW1950MusicOfEquatorialAfrica
2,9228,-0.48,15.89,Africa,Central Africa,Equatorial Central Africa,"Cuvette Dept, C Congo",Kouyou,W Equatorial Bantu,"Fort-Rousset, Ouando, Republic of the Congo",...,,"Gilbert Rouget, André Didier, Musée de l'Homme...",1946,Columbia Records,Columbia World Library of Folk & Primitive Mus...,Sound Archives of the CNRS & the Musée de l'Ho...,,,Edited by Andre Schaeffner and Gilbert Rouget,@CFPM1955FrenchAfrica
3,2137,-0.74,18.11,Africa,Central Africa,Equatorial Central Africa,"Equateur Prov, N W DR Congo",Ekonda Mongo,W Equatorial Bantu,"Southeastern Shore of Lake Tumba, Bikoro Terri...",...,,Alan P. Merriam; Barbara Merriam,1952,Washington Records,Ekonda: Tribal Music of the Congo. Washington ...,"Copy in Alan Lomax Collection, American Folkli...",,Merriam's notes list rattle and scraper accomp...,,@Washington1962EkondaTribalMusic
4,2138,-0.74,18.11,Africa,Central Africa,Equatorial Central Africa,"Equateur Prov, N W DR Congo",Ekonda Mongo,W Equatorial Bantu,"Southeastern Shore of Lake Tumba, Bikoro Terri...",...,,Alan P. Merriam; Barbara Merriam,1952,Washington Records,Ekonda: Tribal Music of the Congo. Washington ...,"Copy in Alan Lomax Collection, American Folkli...",,,,@Washington1962EkondaTribalMusic


# Prepare Data

In [97]:
# Column layout of the long-form table that convert_matrix fills in.
columns = (
    # Identifiers plus the coded value.
    ['soc_id', 'canto_coding_id', 'var_id', 'code']
    # Fields copied from the metadata table.
    + ['year', 'publisher', 'publication_collection', 'repository', 'sources']
)

# Empty accumulator; the converted chunks are appended to it further down.
long_form = pd.DataFrame(columns=columns)

# Split it up into smaller chunks

In [98]:
# Four consecutive 1000-row slices, then everything from row 4000 onwards.
df_1, df_2, df_3, df_4 = (df[start:start + 1000] for start in range(0, 4000, 1000))
df_5 = df[4000:]

# Matrix Conversion Function

In [99]:
def convert_matrix(input_matrix, metadata_df=None, output_columns=None, n_vars=37):
    """Reshape wide song codings into long form, one row per (song, variable).

    For every row of ``input_matrix`` the columns ``cv_1`` .. ``cv_<n_vars>``
    are unpivoted into separate output rows. Each output row carries the
    song's identifiers and the metadata fields of the first metadata row whose
    ``C-id`` equals the song's ``canto_coding_id``. Songs without a matching
    metadata row get ``None`` in all five metadata fields.

    Parameters
    ----------
    input_matrix : pandas.DataFrame
        Wide table with ``canto_coding_id``, ``C_cid`` and ``cv_1`` ..
        ``cv_<n_vars>`` columns. Ids and codes must be ``int()``-convertible.
    metadata_df : pandas.DataFrame, optional
        Table with ``C-id``, ``Year``, ``Publisher``,
        ``Publcation_collection``, ``Repository`` and ``Sources`` columns.
        Defaults to the notebook-level ``metadata`` frame.
    output_columns : list of str, optional
        The nine output column names, in order. Defaults to the
        notebook-level ``columns`` list.
    n_vars : int, optional
        Number of ``cv_*`` columns to unpivot (default 37).

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with ``len(input_matrix) * n_vars`` rows and a
        fresh ``0..n-1`` index.

    Raises
    ------
    ValueError, TypeError
        If an id or a code cannot be converted with ``int()``.
    KeyError
        If an expected column is missing.
    """
    if metadata_df is None:
        metadata_df = metadata
    if output_columns is None:
        output_columns = columns

    # Metadata columns, in the order their values appear in the output row.
    # 'Publcation_collection' is spelled exactly as in the metadata table.
    provenance_fields = (
        'Year',
        'Publisher',
        'Publcation_collection',
        'Repository',
        'Sources',
    )

    # Compare ids as numbers on both sides: a string id (e.g. a spreadsheet
    # cell read as text) would otherwise never equal a numeric 'C-id'.
    metadata_ids = pd.to_numeric(metadata_df['C-id'], errors='coerce')

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []
    for _, row in input_matrix.iterrows():
        song_id = int(row['canto_coding_id'])
        soc_id = row['C_cid']

        matches = metadata_df[(metadata_ids == song_id).to_numpy()]
        if len(matches) > 0:
            provenance = [matches[field].values[0] for field in provenance_fields]
        else:
            # Reset for every song so values never leak over from the
            # previously processed song.
            provenance = [None] * len(provenance_fields)

        for var_id in range(1, n_vars + 1):
            code = int(row['cv_' + str(var_id)])
            records.append([soc_id, song_id, var_id, code] + provenance)

    # dtype=object keeps integers next to missing values from turning into
    # floats (e.g. 1946 -> 1946.0).
    return pd.DataFrame(records, columns=output_columns, dtype=object)

# Execute each chunk one by one

In [100]:
%%notify -m "section 1 completed"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the chunk to long_form again.
long_form = pd.concat([long_form, convert_matrix(df_1)])

In [101]:
%%notify -m "section 2 completed"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the chunk to long_form again.
long_form = pd.concat([long_form, convert_matrix(df_2)])

In [102]:
%%notify -m "section 3 completed"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the chunk to long_form again.
long_form = pd.concat([long_form, convert_matrix(df_3)])

In [103]:
%%notify -m "section 4 completed"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the chunk to long_form again.
long_form = pd.concat([long_form, convert_matrix(df_4)])

In [104]:
%%notify -m "section 5 completed"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the chunk to long_form again.
long_form = pd.concat([long_form, convert_matrix(df_5)])

# Save to disk

In [105]:
# Write the long-form table to disk. to_csv emits the index as the first
# column by default; the chunks were appended without resetting the index, so
# the labels in that column repeat.
# NOTE(review): presumably './output' has to exist already - confirm.
long_form.to_csv('./output/long_form.csv')