In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from collections import *
import math
from __future__ import print_function
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Get Data from Google Sheets

In [2]:
# OAuth scope requested by read_google_sheets when it has to run the login
# flow; the URL names the read-only spreadsheets scope.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

# Reference: https://developers.google.com/sheets/api/quickstart/python
def read_google_sheets(SPREADSHEET_ID, RANGE_NAME, HEADER_RANGE):
    """Fetch a data range and a header range from one Google spreadsheet.

    Credentials are cached in ``token.pickle`` in the working directory. If no
    usable cached credentials are found they are refreshed when possible,
    otherwise obtained through the local-server OAuth flow configured by
    ``credentials.json`` and the module-level ``SCOPES``. In both cases the
    resulting credentials are written back to ``token.pickle``.

    Args:
        SPREADSHEET_ID: Identifier of the spreadsheet to read.
        RANGE_NAME: Range (A1 notation) holding the data rows.
        HEADER_RANGE: Range (A1 notation) holding the header row.

    Returns:
        A ``(values, header_values)`` tuple: the content of the ``'values'``
        field of each API response, or ``[]`` when that field is absent.
    """
    token_path = 'token.pickle'

    # Reuse credentials saved by a previous run, if any.
    # NOTE: unpickling executes whatever the file contains; token.pickle must
    # only ever come from this function.
    credentials = None
    if os.path.exists(token_path):
        with open(token_path, 'rb') as token_file:
            credentials = pickle.load(token_file)

    needs_login = not credentials or not credentials.valid
    if needs_login:
        can_refresh = credentials and credentials.expired and credentials.refresh_token
        if can_refresh:
            credentials.refresh(Request())
        else:
            oauth_flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            credentials = oauth_flow.run_local_server(port=0)
        # Cache the fresh credentials for the next run.
        with open(token_path, 'wb') as token_file:
            pickle.dump(credentials, token_file)

    sheet = build('sheets', 'v4', credentials=credentials).spreadsheets()

    def _fetch_values(cell_range):
        # One values.get request; a response without 'values' yields [].
        response = sheet.values().get(spreadsheetId=SPREADSHEET_ID,
                                      range=cell_range).execute()
        return response.get('values', [])

    values = _fetch_values(RANGE_NAME)
    header_values = _fetch_values(HEADER_RANGE)
    return values, header_values

In [3]:
# Spreadsheet and the two ranges of its `canto_codings` sheet that are read:
# row 1 supplies the column names, rows 2 onward supply the data.
SPREADSHEET_ID = '1AjynK9mMQTw58B_B8b_ZIip3fyUm-aoV7Pp21HziBb0'
HEADER_RANGE = 'canto_codings!A1:AT1'
RANGE_NAME = 'canto_codings!A2:AT'

data, header = read_google_sheets(
    SPREADSHEET_ID,
    RANGE_NAME,
    HEADER_RANGE,
)

# `header` is indexed by row; its first row provides the column labels.
canto_data = pd.DataFrame(data=data, columns=header[0])

In [6]:
# Coded feature columns: cv_1 ... cv_37.
features = ['cv_' + str(number) for number in range(1, 38)]

# Distinct values of the Culture column, in order of first appearance.
cultures = canto_data['Culture'].unique()

# Column layout of the modal-profile table: identifiers first, then features.
columns = ['culture', 'soc_id'] + features

In [18]:
# Build one modal profile per culture: for every feature column, the most
# frequent value among that culture's rows.
#
# Rows are collected in a list and the frame is built once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop re-copies the whole frame on every iteration.
modal_records = []
for culture in cultures:
    # Filter once and reuse the subset for both the mode and the id lookup.
    culture_rows = canto_data[canto_data['Culture'] == culture]

    # mode() may return several rows when values tie; keep the first one.
    modal_profile = culture_rows[features].mode().iloc[0].to_dict()
    modal_profile['culture'] = culture

    # Use the C_cid of the culture's first row as its society id.
    soc_id = culture_rows['C_cid'].iloc[0]
    modal_profile['soc_id'] = soc_id

    modal_records.append(modal_profile)

# Passing `columns` fixes the column order and still yields an empty frame
# with the right columns when there are no cultures.
final_df = pd.DataFrame(modal_records, columns=columns)

In [21]:
# Write the modal profiles to disk. The DataFrame index is written too
# (index=False is not passed), and to_csv does not create the output/
# directory, so it has to exist before this cell runs.
final_df.to_csv('././output/modal_profiles.csv')

In [20]:
# Show the assembled modal-profile table as the cell output.
final_df

Unnamed: 0,culture,soc_id,cv_1,cv_2,cv_3,cv_4,cv_5,cv_6,cv_7,cv_8,...,cv_28,cv_29,cv_30,cv_31,cv_32,cv_33,cv_34,cv_35,cv_36,cv_37
0,South Jakarta,17557,4,256,32,16,2,2,8192,128,...,32,128,2,2,1024,2,2,1024,128,2
1,Balinese,11507,4,2,2,16,2,2,2,2,...,2,128,2,2,1024,8,2,16,128,16
2,Sama,62459,4,256,4,16,2,2,16,2,...,8192,128,2,2,128,64,16,16,128,16
3,Bajau Laut,11397,4,2,2,16,2,2,16,2,...,32,128,2,2,128,256,16,128,1024,16
4,Dayak,14862,128,2,2,1024,1024,16,16,2,...,32,2,16,8192,1024,64,128,128,1024,1024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1012,Makatao,62520,64,2,2,128,128,128,2,2,...,32,128,16,8192,1026,8,128,128,128,1024
1013,Taivoan,62521,32,2,2,1024,128,16,2,2,...,32,128,128,8192,128,8,16,128,1024,1024
1014,Pazehhe,62518,64,2,2,128,1024,1024,2,2,...,512,128,2,8192,128,64,16,8192,128,16
1015,Pingpu Unspecified,30183,32,2,2,1024,128,16,2,2,...,32,128,128,8192,128,64,128,1024,128,1024
