### This code downloads XML bulks of patent BIBLIOGRAPHIC data from USPTO, parses them into a dataframe and creates a flag for patents supporting working-from-home technologies.

In [3]:
#Uploading necessary packages
import pandas as pd
import pandas_read_xml as pdx
import numpy as np
import re
import os

In [6]:
#Parsing the weekly bulk files for 2005-2020 (one sub-directory per year) into one csv file per year.
#The cell is self-contained: every helper it uses is defined right here.

#Specifying the directory where all weekly txt raw patent files are stored
PATENT_ROOT = '/Users/yulia_zhestkova/Patent parsing/'

#Identifier of a new patent in XML: each patent in a bulk file starts with its own XML declaration
XML_DECLARATION = re.compile(r'<\?xml version="1\.0" encoding="UTF-8"\?>')

#Substrings blanked out of (lower-cased) assignee names. They are matched LITERALLY and applied in
#this order; treating them as regular expressions would let ' co.' eat ' com' in 'computer'.
ASSIGNEE_NOISE = (
    '&amp', '&apos', '&#x26', '&#xf8', '&#xd3', '&#xc4', '&#xdc', '&#xc9',
    '&#xe9', '&#xe0', '&#xcd', '&#xf3', '&#xd6', '&#xf6', '&#xc0', '&#xc1',
    '&#xc8', '&#xe7', '&#xd2', '&#xe2', '&#xda', '&#xc2', '&#xd4', '&#xed',
    '&#xfc', '&#xc7', '&#xc3', '&#xe8', '&#xe4', '&#xf2',
    ' ltd', ' incorporated', ' inc.', ' gmbh', ' ag.', ' kg.', ' llc',
    ' limited', 's.p.a.', ' corporation', ' group', ' corp.', ' co.',
    'co. kg', '(publ)', 's.a.s.', 's.r.o.', 's.r.l.', 'k.k.', 'b.v.',
    's.a.r.l.', 'a.s.', 'v.v.i.', ' company',
)

#Columns (and their order) of the yearly csv file
IPA_COLUMNS = ['date_published', 'text', 'title', 'assignee', 'date', 'country', 'section',
               'class', 'subclass', 'group', 'series_code', 'series_id', 'app_id']


def extract_first(pattern, text, required=False):
    """Return group 1 of the first match of ``pattern`` in ``text``.

    Without a match the result is "" unless ``required`` is true, in which
    case a ValueError naming the pattern is raised.
    """
    match = re.search(pattern, text)
    if match is not None:
        return match.group(1)
    if required:
        raise ValueError("pattern {!r} not found in patent document".format(pattern))
    return ""


def split_documents(raw_text):
    """Split the content of one bulk file into individual patent documents.

    Newlines and double quotes are removed from every piece (so the patterns
    used afterwards see attributes as ``name=value``), and pieces that end up
    empty are dropped -- including a whitespace-only chunk before the first
    XML declaration.
    """
    pieces = (piece.replace('\n', '').replace('"', '') for piece in XML_DECLARATION.split(raw_text))
    return [piece for piece in pieces if piece]


def first_orgname(document):
    """Return the text of the first <orgname> element of ``document`` ("" if there is none)."""
    #Greedy: spans from the first '<orgname' to the last '</orgname>' of the document
    raw = extract_first(r'<orgname(.*)</orgname>', document)
    inner = extract_first(r'>(.*)</orgname>', raw) or raw
    inner = inner.replace('>', '')
    #If several organisations were captured, keep only what precedes the first closing tag
    return extract_first(r'(.*?)</orgname', inner) or inner


def clean_assignee(name):
    """Lower-case an assignee name and blank out every ASSIGNEE_NOISE substring.

    Still noisy (e.g. the ';' of an XML entity stays behind); more cleaning is
    required before the names can be used for matching.
    """
    name = name.lower()
    for noise in ASSIGNEE_NOISE:
        name = name.replace(noise, ' ')
    return name


def clean_text(title, abstract):
    """Join title and abstract, lower-case them and keep only letters and spaces."""
    return re.sub(r'[^A-Za-z ]+', '', (title + " " + abstract).lower())


def parse_ipa_document(document):
    """Turn one patent document (as returned by split_documents) into a dict keyed by IPA_COLUMNS.

    Raises ValueError when the publication date or the <application-reference>
    element cannot be found; every other missing field becomes "".
    """
    date_published = extract_first(r'date-publ=(.*)><us-bibliographic-data-application', document, required=True)
    #Bibliographic part holding application id, filing date and country
    info = extract_first(r'<application-reference(.*)</application-reference>', document, required=True)
    title = extract_first(r'id=\w{5,6}>(.*)',
                          extract_first(r'<invention-title(.*)</invention-title>', document))
    abstract = extract_first(r'id=abstract><p id=p-0001 num=0000>(.*)</p>',
                             extract_first(r'<abstract(.*)</abstract>', document))
    #XML class tree of the main CPC classification
    main_class = extract_first(
        r'<classifications-cpc><main-cpc><classification-cpc>(.*)</classification-cpc></main-cpc>', document)
    app_id = extract_first(r'<doc-number>(.*)</doc-number>', info)
    return {
        'date_published': date_published,
        'text': clean_text(title, abstract),
        'title': title,
        'assignee': clean_assignee(first_orgname(document)),
        'date': extract_first(r'<date>(.*)</date>', info),
        'country': extract_first(r'<country>(.*)</country>', info),
        'section': extract_first(r'<section>(.*)</section>', main_class),
        'class': extract_first(r'<class>(.*)</class>', main_class),
        'subclass': extract_first(r'<subclass>(.*)</subclass>', main_class),
        'group': extract_first(r'<main-group>(.*)</main-group>', main_class),
        #First 2 digits of application id is series code, the remaining digits are series id
        'series_code': app_id[:2],
        'series_id': app_id[2:],
        'app_id': app_id,
    }


def build_ipa_frame(documents):
    """Format a list of individual patent documents into a pandas dataframe (one row per patent)."""
    return pd.DataFrame([parse_ipa_document(document) for document in documents], columns=IPA_COLUMNS)


def process_ipa_year(year):
    """Parse every .txt bulk file of ``year`` and write complete<year>.csv into the year's directory.

    A missing directory, or a directory without .txt files, is reported and
    skipped so that no stale or empty csv file is written.
    """
    print("Processing year {}".format(year))
    directory = PATENT_ROOT + str(year) + '/'
    if not os.path.isdir(directory):
        print("Skipping year {}: directory {} not found".format(year, directory))
        return
    os.chdir(directory)

    #Going through all the txt files and splitting them into individual patent txt pieces
    frames = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), mode='r', encoding='utf-8') as bulk_file:
            documents = split_documents(bulk_file.read())
        print("Processing file {}".format(filename))
        frames.append(build_ipa_frame(documents))

    if not frames:
        print("Skipping year {}: no .txt files found".format(year))
        return
    #Combining all files for one year in one csv file (a single concat instead of repeated appends)
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(os.path.join(directory, 'complete' + str(year) + '.csv'), index=False)


for j in range(2005, 2021):
    process_ipa_year(j)

Processing year 2005
Processing file ipab20050407.txt
Processing file ipab20050203.txt
Processing file ipab20050217.txt
Processing file ipab20050414.txt
Processing file ipab20050428.txt
Processing file ipab20050616.txt
Processing file ipab20050602.txt
Processing file ipab20050825.txt
Processing file ipab20051124.txt
Processing file ipab20050818.txt
Processing file ipab20050210.txt
Processing file ipab20050303.txt
Processing file ipab20050317.txt
Processing file ipab20050707.txt
Processing file ipab20050908.txt
Processing file ipab20050505.txt
Processing file ipab20051222.txt
Processing file ipab20051020.txt
Processing file ipab20051208.txt
Processing file ipab20050512.txt
Processing file ipab20050922.txt
Processing file ipab20051027.txt
Processing file ipab20050113.txt
Processing file ipab20050728.txt
Processing file ipab20050714.txt
Processing file ipab20050106.txt
Processing file ipab20050310.txt
Processing file ipab20050120.txt
Processing file ipab20050915.txt
Processing file ipab20

Processing file ipab20090129.txt
Processing file ipab20090101.txt
Processing file ipab20090115.txt
Processing file ipab20090507.txt
Processing file ipab20091022.txt
Processing file ipab20091119.txt
Processing file ipab20090212.txt
Processing file ipab20090827.txt
Processing file ipab20090416.txt
Processing file ipab20090402.txt
Processing file ipab20091126.txt
Processing file ipab20090205.txt
Processing file ipab20090611.txt
Processing file ipab20090604.txt
Processing file ipab20090820.txt
Processing year 2010
Processing file ipab20100225.txt
Processing file ipab20100812.txt
Processing file ipab20100422.txt
Processing file ipab20100218.txt
Processing file ipab20101104.txt
Processing file ipab20100805.txt
Processing file ipab20100408.txt
Processing file ipab20101111.txt
Processing file ipab20100624.txt
Processing file ipab20100520.txt
Processing file ipab20100722.txt
Processing file ipab20101007.txt
Processing file ipab20100325.txt
Processing file ipab20100318.txt
Processing file ipab20

Processing file ipab20140102.txt
Processing file ipab20140710.txt
Processing file ipab20141023.txt
Processing file ipab20141009.txt
Processing file ipab20140327.txt
Processing file ipab20140904.txt
Processing file ipab20140911.txt
Processing file ipab20140508.txt
Processing file ipab20140130.txt
Processing file ipab20141204.txt
Processing file ipab20140522.txt
Processing file ipab20141211.txt
Processing file ipab20140123.txt
Processing file ipab20140731.txt
Processing file ipab20140724.txt
Processing file ipab20141002.txt
Processing file ipab20141016.txt
Processing file ipab20140320.txt
Processing file ipab20140109.txt
Processing file ipab20140807.txt
Processing file ipab20141106.txt
Processing file ipab20141113.txt
Processing file ipab20140227.txt
Processing file ipab20140626.txt
Processing file ipab20140424.txt
Processing file ipab20140814.txt
Processing file ipab20140828.txt
Processing file ipab20140619.txt
Processing file ipab20140220.txt
Processing year 2015
Processing file ipab20

Processing file ipab20190418.txt
Processing file ipab20190801.txt
Processing file ipab20190815.txt
Processing file ipab20190829.txt
Processing file ipab20191205.txt
Processing file ipab20190523.txt
Processing file ipab20190912.txt
Processing file ipab20191212.txt
Processing file ipab20190124.txt
Processing file ipab20191010.txt
Processing file ipab20190905.txt
Processing file ipab20190509.txt
Processing file ipab20190131.txt
Processing file ipab20190530.txt
Processing file ipab20190822_2.txt
Processing file ipab20190718.txt
Processing file ipab20190725.txt
Processing file ipab20190822_3.txt
Processing file ipab20191003.txt
Processing file ipab20191017.txt
Processing file ipab20190321.txt
Processing file ipab20190502.txt
Processing file ipab20190516.txt
Processing file ipab20190926.txt
Processing file ipab20191219.txt
Processing file ipab20191031.txt
Processing file ipab20190307.txt
Processing file ipab20190919.txt
Processing file ipab20190808_2.txt
Processing file ipab20191024.txt
Proc

In [8]:
#Files for 2002-2004 have a bit different XML version and require a different parsing code.
#The cell is self-contained: every helper it uses is defined right here.

#Specifying the directory where all weekly txt raw patent files are stored
PATENT_ROOT = '/Users/yulia_zhestkova/Patent parsing/'

#Identifier of a new patent in XML: each patent in a bulk file starts with its own XML declaration
XML_DECLARATION = re.compile(r'<\?xml version="1\.0" encoding="UTF-8"\?>')

#Substrings blanked out of (lower-cased) assignee names. They are matched LITERALLY and applied in
#this order; treating them as regular expressions would let ' co.' eat ' com' in 'computer'.
ASSIGNEE_NOISE = (
    '&amp', '&apos', '&#x26', '&#xf8', '&#xd3', '&#xc4', '&#xdc', '&#xc9',
    '&#xe9', '&#xe0', '&#xcd', '&#xf3', '&#xd6', '&#xf6', '&#xc0', '&#xc1',
    '&#xc8', '&#xe7', '&#xd2', '&#xe2', '&#xda', '&#xc2', '&#xd4', '&#xed',
    '&#xfc', '&#xc7', '&#xc3', '&#xe8', '&#xe4', '&#xf2',
    ' ltd', ' incorporated', ' inc.', ' gmbh', ' ag.', ' kg.', ' llc',
    ' limited', 's.p.a.', ' corporation', ' group', ' corp.', ' co.',
    'co. kg', '(publ)', 's.a.s.', 's.r.o.', 's.r.l.', 'k.k.', 'b.v.',
    's.a.r.l.', 'a.s.', 'v.v.i.', ' company',
)

#Columns (and their order) of the yearly csv file
PAB_COLUMNS = ['date_published', 'assignee', 'text', 'title', 'date', 'series_code', 'series_id', 'app_id']


def extract_first(pattern, text, required=False):
    """Return group 1 of the first match of ``pattern`` in ``text``.

    Without a match the result is "" unless ``required`` is true, in which
    case a ValueError naming the pattern is raised.
    """
    match = re.search(pattern, text)
    if match is not None:
        return match.group(1)
    if required:
        raise ValueError("pattern {!r} not found in patent document".format(pattern))
    return ""


def split_documents(raw_text):
    """Split the content of one bulk file into individual patent documents.

    Newlines and double quotes are removed from every piece (so the patterns
    used afterwards see attributes as ``name=value``), and pieces that end up
    empty are dropped -- including a whitespace-only chunk before the first
    XML declaration.
    """
    pieces = (piece.replace('\n', '').replace('"', '') for piece in XML_DECLARATION.split(raw_text))
    return [piece for piece in pieces if piece]


def first_organization_name(document):
    """Return the text of the first <organization-name> element of ``document`` ("" if there is none)."""
    #Greedy: spans from the first opening tag to the last closing tag of the document
    raw = extract_first(r'<organization-name>(.*)</organization-name>', document)
    inner = extract_first(r'>(.*)</organization-name>', raw) or raw
    inner = inner.replace('>', '')
    #If several organisations were captured, keep only what precedes the first closing tag
    return extract_first(r'(.*?)</organization-name', inner) or inner


def clean_assignee(name):
    """Lower-case an assignee name and blank out every ASSIGNEE_NOISE substring.

    Still noisy (e.g. the ';' of an XML entity stays behind); more cleaning is
    required before the names can be used for matching.
    """
    name = name.lower()
    for noise in ASSIGNEE_NOISE:
        name = name.replace(noise, ' ')
    return name


def clean_text(title, abstract):
    """Join title and abstract, lower-case them and keep only letters and spaces."""
    return re.sub(r'[^A-Za-z ]+', '', (title + " " + abstract).lower())


def parse_pab_document(document):
    """Turn one 2002-2004 patent document (as returned by split_documents) into a dict keyed by PAB_COLUMNS.

    Raises ValueError when <document-date> or <domestic-filing-data> cannot be
    found; every other missing field becomes "".
    """
    #Only the first 8 characters (YYYYMMDD) of the captured span are the publication date
    date_published = extract_first(r'<document-date>(.*)</document-date>', document, required=True)[:8]
    info = extract_first(r'<domestic-filing-data>(.*)</domestic-filing-data>', document, required=True)
    title = extract_first(r'<title-of-invention>(.*)</title-of-invention>', document)
    #Double quotes were already stripped from the document, so the paragraph tag must be matched
    #without them -- a pattern with id="A-0001" can never match and leaves every abstract empty
    abstract = extract_first(r'<paragraph id=A-0001 lvl=0>(.*)</paragraph>',
                             extract_first(r'<subdoc-abstract>(.*)</subdoc-abstract>', document))
    app_id = extract_first(r'<doc-number>(.*)</doc-number>', info)
    return {
        'date_published': date_published,
        'assignee': clean_assignee(first_organization_name(document)),
        'text': clean_text(title, abstract),
        'title': title,
        'date': extract_first(r'<filing-date>(.*)</filing-date>', info),
        #First 2 digits of application id is series code, the remaining digits are series id
        'series_code': app_id[:2],
        'series_id': app_id[2:],
        'app_id': app_id,
    }


def build_pab_frame(documents):
    """Format a list of individual patent documents into a pandas dataframe (one row per patent)."""
    return pd.DataFrame([parse_pab_document(document) for document in documents], columns=PAB_COLUMNS)


def process_pab_year(year):
    """Parse every .txt bulk file of ``year`` and write complete<year>.csv into the year's directory.

    A missing directory, or a directory without .txt files, is reported and
    skipped so that no stale or empty csv file is written.
    """
    print("Processing year {}".format(year))
    directory = PATENT_ROOT + str(year) + '/'
    if not os.path.isdir(directory):
        print("Skipping year {}: directory {} not found".format(year, directory))
        return
    os.chdir(directory)

    #Going through all the txt files and splitting them into individual patent txt pieces
    frames = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), mode='r', encoding='utf-8') as bulk_file:
            documents = split_documents(bulk_file.read())
        print("Processing file {}".format(filename))
        frames.append(build_pab_frame(documents))

    if not frames:
        print("Skipping year {}: no .txt files found".format(year))
        return
    #Combining all files for one year in one csv file (a single concat instead of repeated appends)
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(os.path.join(directory, 'complete' + str(year) + '.csv'), index=False)


for j in range(2002, 2005):
    process_pab_year(j)




Processing year 2002
Processing file pab20020620.txt
Processing file pab20021107.txt
Processing file pab20020418.txt
Processing file pab20020801.txt
Processing file pab20020815.txt
Processing file pab20020829.txt
Processing file pab20021114.txt
Processing file pab20021128.txt
Processing file pab20020627.txt
Processing file pab20020425.txt
Processing file pab20020221.txt
Processing file pab20020131.txt
Processing file pab20020509.txt
Processing file pab20020905.txt
Processing file pab20021212.txt
Processing file pab20020124.txt
Processing file pab20021010.txt
Processing file pab20020523.txt
Processing file pab20020912.txt
Processing file pab20021205.txt
Processing file pab20021003.txt
Processing file pab20021017.txt
Processing file pab20020321.txt
Processing file pab20020725.txt
Processing file pab20020718.txt
Processing file pab20020530.txt
Processing file pab20021024.txt
Processing file pab20020110.txt
Processing file pab20021226.txt
Processing file pab20020919.txt
Processing file pab

In [4]:
#Loading a dictionary of words associated with working from home
wfh=['telecommuting', 'telework', 'teleworking', 'working from home', 'mobile work', 'remote work', 'flexible workplace', 'work from home', 'mobile working', 'remote working', 'work remotely', 'working remotely', 'remote workplace', 'telecommuter', 'teleworker', 'home-sourced worker', 'home-sourced employee', 'work-at-home', 'work at home', 'telecommuting specialist', 'nomadic worker', 'nomadic employee', 'work-from-home', 'work-from-anywhere', 'video conference', 'video conferencing', 'virtual office', 'distance work', 'flexible work', 'virtual work', 'virtual office', 'virtual employee', 'home office', 'home-based office', 'home-based work', 'work from anywhere', 'working from anywhere', 'work-from-anywhere', 'digital workplace', 'video chat', 'video call', 'teleconference', 'teleconferencing', 'working from a remote location', 'work from a remote location']


def normalise_wfh_term(term):
    """Reduce a dictionary term to lowercase letters and spaces.

    The 'text' column of the yearly csv files was cleaned the same way, so a
    term such as 'work-from-home' only occurs there as 'workfromhome'; without
    this step hyphenated terms could never be found.
    """
    return re.sub(r'[^A-Za-z ]+', '', term.lower())


def build_wfh_pattern(terms):
    """Return one regex alternation matching any normalised term ("" when there are no terms).

    Longer terms come first so that e.g. 'teleworking' is preferred over
    'telework' at the same position; duplicates are dropped.
    """
    cleaned = {normalise_wfh_term(term) for term in terms}
    cleaned.discard('')
    ordered = sorted(cleaned, key=lambda term: (-len(term), term))
    return '|'.join(re.escape(term) for term in ordered)


def count_wfh_flags(texts, terms=None):
    """Count, for every entry of the Series ``texts``, the non-overlapping dictionary hits.

    ``terms`` defaults to the ``wfh`` dictionary. Missing texts count as 0.
    """
    pattern = build_wfh_pattern(wfh if terms is None else terms)
    if not pattern:
        #An empty pattern would match at every position
        return pd.Series(0, index=texts.index)
    return texts.fillna('').str.count(pattern)


def build_wfh_flags(base_dir, years):
    """Read <base_dir>/<year>/complete<year>.csv for every year and return the stacked flag table.

    Years without a csv file are reported and skipped; returns None when no
    file could be read at all.
    """
    frames = []
    for year in years:
        print("Processing file {}".format(year))
        csv_path = os.path.join(base_dir, str(year), 'complete' + str(year) + '.csv')
        if not os.path.isfile(csv_path):
            print("Skipping year {}: {} not found".format(year, csv_path))
            continue
        df = pd.read_csv(csv_path)
        #Count how many times a WFH expression occurs in a patent abstract and title
        df['count flags'] = count_wfh_flags(df['text'])
        frames.append(df[['date', 'app_id', 'count flags', 'assignee', 'date_published']])
    if not frames:
        return None
    #A single concat instead of repeated appends
    return pd.concat(frames, ignore_index=True)


#Redefining files directory
directory = '/Users/yulia_zhestkova/Patent parsing/'

#Processing complete csv files (output of the previous code)
if not os.path.isdir(directory):
    print("Directory {} not found; nothing to process".format(directory))
else:
    os.chdir(directory)
    df_master = build_wfh_flags(directory, range(2010, 2021))
    if df_master is None:
        print("No yearly csv files found; wfhflag.csv not written")
    else:
        df_master.to_csv(os.path.join(directory, 'wfhflag.csv'), index=False)

Processing file 2010
Processing file 2011
Processing file 2012
Processing file 2013
Processing file 2014
Processing file 2015
Processing file 2016
Processing file 2017
Processing file 2018
Processing file 2019
Processing file 2020
