In [1]:
import tabula
import re
import pandas as pd
import os
from oauth2client.service_account import ServiceAccountCredentials
import gspread

In [5]:
# Change pdf name
df = tabula.read_pdf("20201126 Waltham Forest Privated Public Register November 2020.pdf"
                     , pages='all'
                     , pandas_options = {'header': None})

In [6]:
len(df) #should total the amount of pages in the document

76

In [7]:
class SplitFields:
    def __init__(self, field):
        self.field = field
    
    @property
    def split_list(self):
        return re.split(r'([A-Z]{1,2}[0-9R][0-9A-Z]? [0-9][A-Z]{2})', self.field)
    
    @property
    def property_address(self):
        return ''.join(self.split_list[:len(self.split_list)-1])
    
    @property
    def licence_holder(self):
        return self.split_list[-1]
        

In [8]:
complete_register = pd.concat(df, ignore_index=True)

# remove first line which contains header data
complete_register.drop([0], inplace=True)

# reset index after dropping row
complete_register.reset_index(drop=True, inplace=True)

# get property address from field
complete_register[5] = complete_register[1].apply(lambda x: SplitFields(x).property_address)

# get licence holder from field
complete_register[6] = complete_register.apply(lambda x: SplitFields(x[1]).licence_holder if pd.isnull(x[2]) else x[2], axis=1)

In [9]:
# created a df with columns to keep
final_df = complete_register[[0,5,6,3,4]].copy()

In [10]:
# rename columns
final_df.columns = ['ref_no', 'property_address', 'licence_holder', 'start_date', 'end_date']

In [11]:
final_df.head()

Unnamed: 0,ref_no,property_address,licence_holder,start_date,end_date
0,14531,"43 Chester Road, Walthamstow, E17 7HP",Mr Lahrie Mohamed,15/03/2018,15/03/2023
1,20502,"Flat 1, 8 Goldsmith Road, Leyton, E10 5HA",Mr Lahrie Mohamed,12/05/2017,12/05/2022
2,20881,"Flat 2, 79 Park Road, Leyton, E10 7BZ",Mr Lahrie Mohamed,01/11/2018,01/11/2023
3,20932,"Ground Floor Flat, 23 Cornwallis Road, Waltham...",Mr Lahrie Mohamed,26/10/2018,26/10/2023
4,21254,"Flat 3, 62 St James Street, Walthamstow, E17 7PE",Mr Lahrie Mohamed,26/10/2018,26/10/2023


In [9]:
len(final_df)

2332

In [3]:
# file path of google credentials
filepath_cred = os.path.dirname(os.path.dirname(os.getcwd()))+'/ignore_me/service_account.json'

# use creds to create a client to interact with the Google Drive API
scope = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name(filepath_cred, scope)
client = gspread.authorize(creds)

# Find a workbook by name and open the first sheet
# Make sure you use the right name here.
sheet = client.open("hmo_registers").sheet1

# Extract and print all of the values
# list_of_hashes = sheet.get_all_records()
# print(list_of_hashes)

[{'1': 'once ', '2': 'i', '3': 'ate'}, {'1': 'a', '2': 'fish', '3': 'alive'}]


In [14]:
# final_df.columns.values.tolist()

In [15]:
# final_df.values.tolist()

In [16]:
final_df.columns.values.tolist() + final_df.values.tolist()

['ref_no',
 'property_address',
 'licence_holder',
 'start_date',
 'end_date',
 ['14531',
  '43 Chester Road, Walthamstow, E17 7HP',
  ' Mr Lahrie Mohamed',
  '15/03/2018',
  '15/03/2023'],
 ['20502',
  'Flat 1, 8 Goldsmith Road, Leyton, E10 5HA',
  ' Mr Lahrie Mohamed',
  '12/05/2017',
  '12/05/2022'],
 ['20881',
  'Flat 2, 79 Park Road, Leyton, E10 7BZ',
  ' Mr Lahrie Mohamed',
  '01/11/2018',
  '01/11/2023'],
 ['20932',
  'Ground Floor Flat, 23 Cornwallis Road, Walthamstow, E17 6NL',
  ' Mr Lahrie Mohamed',
  '26/10/2018',
  '26/10/2023'],
 ['21254',
  'Flat 3, 62 St James Street, Walthamstow, E17 7PE',
  ' Mr Lahrie Mohamed',
  '26/10/2018',
  '26/10/2023'],
 ['21484',
  'Flat 1, 165 Sinclair Road, Chingford, E4 8PP',
  ' Mr Moshe Newman',
  '22/06/2016',
  '22/06/2021'],
 ['22965',
  '24 Mayfield Road, Walthamstow, E17 5RH',
  ' Mr David Mitchell',
  '04/01/2016',
  '04/01/2021'],
 ['24877',
  'Ground Floor Flat, 174 Blackhorse Lane, Walthamstow, E17 6AA',
  ' Mr Mohammed Farooq A

In [21]:
from gspread_pandas import Spread

file_name = "http://stats.idre.ucla.edu/stat/data/binary.csv"
df = pd.read_csv(file_name)

# 'Example Spreadsheet' needs to already exist and your user must have access to it
spread = Spread('hmo_registers', creds=ServiceAccountCredentials.from_json_keyfile_name(filepath_cred, scope))
# This will ask to authenticate if you haven't done so before

# Display available worksheets
spread.sheets

# Save DataFrame to worksheet 'New Test Sheet', create it first if it doesn't exist
spread.df_to_sheet(df, index=False, sheet='New Test Sheet', start='A2', replace=True)
spread.update_cells('A1', 'A1', ['Created by:'])

In [None]:
# sheet.update([dataframe.columns.values.tolist()] + dataframe.values.tolist())
