In [38]:
# Set up paths to your data
from pathlib import Path
data_dir = str(Path('').parent / 'analysis')
paper_info_filename = 'cvpr_paper_info_and_keywords.pkl'
paper_extra_info_filename = 'cvpr_papers_to_all_patents'
codes_filename = 'codes_keywords_filtered.txt'
countries_filename = 'country-codes.txt'
elite_univs_filename = 'elite_universities.txt'

codes_of_interest = ['surveil', 'hard_crime',
                     'body_parts', 'bodies',
                     'demographic', 'children',
                     'scenes', 
                     #'significant_traces', 
                     'hard_movement', 'soft_influence', 
                    ]

# Data setup

In [22]:
'''Imports'''
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pandas import DataFrame
import os
from itertools import chain
import plotly.graph_objects as go
import plotly
import plotly.express as px
import numpy as np
import pickle
from plotly.subplots import make_subplots
import plotly.io as pio; pio.renderers.default = "iframe"
from os.path import join
from IPython.display import display
import requests
from bs4 import BeautifulSoup
import pickle
import re

np.random.seed(1991)

In [3]:
'''Make output directories if necessary'''
(figs_dir := Path(data_dir) / '..' / 'figures').mkdir(exist_ok=True)
(patent_dir := Path(data_dir) / 'patents').mkdir(exist_ok=True)

In [4]:
'''Helpers to handle paper and patent data'''
def read_lists(filename, filter_comments=True):
    '''Read a file containing codes in our special format and return a dictionary of the codes'''
    with open(filename) as f:
        d = {section.split('\n')[0].strip('## '): [item for item in section.strip().split('\n')[1:] if item]  # code to keywords
             for section in f.read().split('\n\n')}  # for each code's section
    if filter_comments:
        d = {key: [item.split('#')[0].strip() for item in d[key] if not item.startswith('#')] for key in d}  # keep real part of valid lines
        for key in list(d):
            if d[key] == []:  # delete codes with no items
                del d[key]
    return d

def write_lists(lists, filename):
    '''Write a file containing codes in our special format.'''
    split = '\n'
    s = "\n\n".join([f'## {header}\n{split.join([item for item in lists[header]])}' for header in lists])
    with open(filename, 'w') as f:
        f.write(s)
    
def load_papers():
    '''Load all papers and info into a Pandas DataFrame'''
    papers = (pd.read_pickle(os.path.join(data_dir, paper_info_filename))
              .rename_axis('PaperId')  # Rename index
              .rename(dict(DisplayName='Institution', DisplayName_fields='Field'), axis=1))  # Rename some columns
    # Load papers' patents
    papers_patents = pickle.load(open(os.path.join(data_dir, paper_extra_info_filename), 'rb'))
    papers_patents = DataFrame(papers_patents.items(), columns=['PaperId',  'patents']).set_index('PaperId')  # Convert to Dataframe
    papers_patents.patents = papers_patents.patents.apply(lambda patents: [p.replace('-', '') for p in patents])
    # Combine and clean
    papers = papers.join(papers_patents)
    papers.patents = papers.patents.fillna("").apply(list)
    for col in ['PaperId', 'Latitude', 'Longitude']:  # Clean columns that say the same info multiple times
        papers[col] = papers[col].apply(lambda vals: vals[0])
    papers.Year = papers.Year.astype(int)  # Convert year column from strings to integers
    papers['Decade'] = papers.Year.apply(lambda year: 10 * (year//10))  # Save decade
    # Aggregate keyword columns into code columns
    codes = read_lists(join(data_dir, codes_filename))
    missing_keywords = [keyword for keyword in sum(codes.values(), []) if keyword not in papers.columns]
    if missing_keywords:
        print(f'No data on these keywords so they will be ignored:\n{", ".join(missing_keywords)}')
    for code, keywords in codes.items():
        papers[f'{code}_code'] = papers[[keyword for keyword in keywords if keyword in papers.columns]].sum(axis=1)
    # Add whether a paper was ever cited in a patent (either of the following two lines have the same result)
    papers['n_patents'] = papers.patents.apply(len)
    # papers['n_patents'] = papers['keyword_depth_vec'].apply(lambda cites: int(cites[0,:,0].sum()))
    return papers

def filter_paper_years(papers, year_range=range(1990, 2022), years_to_drop=[1990, 1995, 2002]):
    '''Filter papers DataFrame to requested years and return updated Pandas DataFrame. The defaults result in limiting to the years we view as reliable data.'''
    return papers.query('(Year in @year_range) & (Year not in @years_to_drop)')
    
def mark_interesting(papers, codes_of_interest):
    '''Given papers DataFrame and which codes are of interest (list of strings), add columns indicating which papers are of interest and related info'''
    papers['patents_of_interest'] = papers[codes_of_interest].sum(axis=1).apply(set).apply(list)
    papers['n_patents_of_interest'] = papers.patents_of_interest.apply(len)
    papers['n_patents_not_of_interest'] = papers.n_patents - papers.n_patents_of_interest

def analyze_sources(papers, source):
    '''Given papers DataFrame and desired source type for analysis (string, e.g. Institution, Country, Field, Year, or Decade), collect and return institutions DataFrame'''
    # Each link between a source and a paper+details (e.g. a paper affiliated with U.S. and China has two otherwise identical rows)
    if source == 'Country':  # Extra step before
        source = 'Iso3166Code'
    links = (papers[[source, 'PaperId', 'patents', 'n_patents', 'patents_of_interest', 'n_patents_of_interest']]
             .explode(source)  # each paper has multiple links
             .drop_duplicates(['PaperId', source])  # remove duplicate links
             .reset_index(drop=True))
    if source == 'Iso3166Code':  # Extra step after
        source = 'Country'
        countries = dict([line.strip().split('\t') for line in open(os.path.join(data_dir, countries_filename)).readlines()])
        links.insert(0, 'Country', links.Iso3166Code.replace(countries))
    # Summarize the sources' downstream patents
    sources_general_info = links.groupby(source).agg(dict(PaperId=list, patents=sum)).rename(dict(PaperId='papers'), axis=1)
    sources_general_info.patents = sources_general_info.patents.apply(set).apply(list)  # drop duplicate patents
    sources_patent_info = links.query('n_patents > 0').groupby(source).agg(dict(PaperId=list)).rename(dict(PaperId='papers_patented'), axis=1)
    sources_info_of_interest = links.query('n_patents_of_interest > 0').groupby(source).agg(dict(PaperId=list, patents_of_interest=sum)).rename(dict(PaperId='papers_of_interest'), axis=1)
    sources_info_of_interest.patents_of_interest = sources_info_of_interest.patents_of_interest.apply(set).apply(list)  # drop duplicate patents
    sources = sources_general_info.join([sources_patent_info, sources_info_of_interest]).reset_index()
    for col in ['papers', 'papers_patented', 'patents', 'papers_of_interest', 'patents_of_interest']:
        sources[col] = sources[col].fillna("").apply(list)
        sources[f'n_{col}'] = sources[col].apply(len)
    # Drop generic sources
    if source == 'Field':
        generic_fields = [
            'Artificial intelligence', 'Computer vision', 'Pattern recognition', 'Mathematics', 'Computer science', 
            'Machine learning', 'Deep learning', 'Algorithm', '(', 'Convolutional neural net', 'Pixel', 'Artificial neural networks']
        sources = sources[~sources.Field.apply(lambda field: any([generic_field.lower() in field.lower() for generic_field in generic_fields]))]
    # Clean
    sources = sources.replace('French Institute for Research in Computer Science and Automation', 'IRIA')
    sources = sources.replace('Korea Advanced Institute of Science and Technology', 'Korea Advanced Inst. of Science & Tech.')
    sources = sources.replace('University of Illinois Urbana-Champaign', 'Urbana-Champaign')
    sources = sources.replace('University of California, Berkeley', 'Berkeley')
    sources = sources.replace('Massachusetts Institute of Technology', 'MIT')
    sources = sources.replace('Cognitive neuroscience of visual object recognition', 'CogSci of object recognition')
    if source == 'Institution':
        sources.Institution = sources.Institution.apply(lambda name: name.replace('University', 'Univ.'))
    if source == 'Decade':
        decades_of_interest = [1990, 2010]
        sources = sources.query('Decade in @decades_of_interest')
    return sources

def filter_lists(lists_fp, constraint, flag):
    '''Add a note to list items that do not satisfy the constraint'''
    old_lists = read_lists(lists_fp, filter_comments=False)
    lists = {header: [item if (item.startswith('#') or constraint(item)) else f'# {item}  # {flag}' for item in old_lists[header]] 
             for header in old_lists}
    write_lists(lists, lists_fp)
    return read_lists(lists_fp)

def code_name(code):
    return f'{code}_code'

In [40]:
'''Load papers and codes'''
papers = filter_paper_years(load_papers())
codes = read_lists(join(data_dir, codes_filename))
keywords_of_interest = sum([codes[code] for code in codes_of_interest], [])
write_lists({'final': keywords_of_interest}, join(data_dir, 'final_keywords.txt'))
mark_interesting(papers, [code_name(code) for code in codes_of_interest])

# Drop certain keywords

In [112]:
# Note keywords that were not searched for during automatic annotation of our largescale dataset
constraint = lambda keyword: keyword in all_papers.columns
codes = filter_lists(join(data_dir, codes_filename), constraint, 'not assessed')

In [113]:
# Note which keywords never occur
constraint = lambda keyword: len(all_papers[keyword].sum()) > 10
codes = filter_lists(join(data_dir, codes_filename), constraint, 'does not occur')

# Pull example sentences for manual inspection

In [125]:
'''Helpers for getting keywords' example patents and example sentences'''
def get_patent_url(patent):
    return f"https://patents.google.com/patent/{patent}/en"
def get_patent_soup_fresh(patent):
    page = requests.get(get_patent_url(patent))  # get page from online
    return BeautifulSoup(page.content, "html.parser")
def get_patent_soup(patent, patent_dir):
    path = os.path.join(patent_dir, f'{patent}.pkl')
    if not os.path.exists(path):  # download
        pickle.dump(get_patent_soup_fresh(patent), open(path, 'wb'))
    with open(path, 'rb') as f:
        return pickle.load(f)
def get_patent_title(patent, patent_dir):
    return get_patent_soup(patent, patent_dir).find('span', dict(itemprop="title")).get_text().strip()
def get_patent_main(patent, patent_dir):
    soup = get_patent_soup(patent, patent_dir)
    divs = soup.find_all('div', {'class': 'description-paragraph'})
    return '\n\n'.join(map(str,divs))
def find_example_div(patent, s, patent_dir):
    div_start, div_end = '<div[^>]*>', '</div>'
    return re.search(f'{div_start}(.* {s} .*){div_end}', get_patent_main(patent, patent_dir), flags=re.I).groups()[0]
def find_example_sentence(text, keyword):
    non_stops = ['(\. \d)*', '(\.\d)*', '(\..\.)*']  # periods that are not stops, e.g., decimals, abbreviations, ...
    return re.search(f'([^.]*{"".join(non_stops)}[^.]* {keyword} [^.]*\.)', text, flags=re.I).groups()[0]

# Unit tests:
# patent, keyword = 'WO2014005022A1', 'apartment'  # Set these
# print(f'patents.google.com/patent/{patent}/en')
# get_patent_soup(patent, patent_dir)
# get_patent_main(patent, patent_dir)
# div = find_example_div(patent, keyword, patent_dir)
# sentence = find_example_sentence(div, keyword)
# get_patent_title(patent, patent_dir)

In [154]:
'''Find examples in patents'''
n_examples = 5  # number of example sentences desired 

codes_info = {}
for code in codes_of_interest:
    keywords_info = []
    for keyword in codes[code][:]:
        print('\n', keyword)
        keyword_patents = set(all_papers[keyword].sum())
        titles = []
        for patent in list(keyword_patents):
            print(patent, end=" ")
            url = get_patent_url(patent)
            title = get_patent_title(patent, patent_dir)
            if title not in titles:  # for one keyword, all patents should be unique
                print(title, end=" ")
                try:
                    div = find_example_div(patent, keyword, patent_dir)
                    div = re.sub('<[^>]*>', '', div)  # remove html tags inside div
                    sentence = find_example_sentence(div, keyword)
                    keywords_info.append((keyword, patent, url, title, sentence, div))
                except AttributeError:
                    print(' - failed', end=' ')
                    keywords_info.append((keyword, patent, url, title, '', ''))
                titles.append(title)
                if len(titles) == n_examples:  # check if we're done finding patents
                    break
            print()
    columns=['keyword', 'patent', 'url', 'title', 'sentence', 'section']
    codes_info[code] = pd.DataFrame(keywords_info, columns=columns)


 iris
US8050463 Iris recognition system having image quality metrics 
US10289908 Method, apparatus, and computer program product for tracking eye gaze and eye movement 
US8761458 System for iris detection, tracking and recognition at a distance 
CN104735361A Method and apparatus for acquiring a set of images illuminated by a flash 
US9504420 Methods and arrangements for identifying dermatological diagnoses with clinically negligible probabilities 
 irises
US10580133 Techniques for identifying blepharoptosis from an image 
EP1764758A1 Lane boundary recognition apparatus for vehicle  - failed 
US10311768 Virtual window 
US9965982 Near-eye light field display 
US10275648 Image processing method and system for iris recognition 
 face
US9836641 Generating numeric embeddings of images 
US10134440 Video summarization using audio and visual cues 
US10223621 Artificially intelligent systems, devices, and methods for learning and/or using visual surrounding for autonomous object operation 
US94

In [155]:
codes_info[code]

Unnamed: 0,keyword,patent,url,title,sentence,section
0,surveil,US10250809,https://patents.google.com/patent/US10250809/en,Video stabilization system and method,"For example, if a camera is mounted to survei...","In many typical applications, it is desirable ..."
1,surveil,US9330315,https://patents.google.com/patent/US9330315/en,Determining foregroundness of an object in sur...,The present invention relates generally to the...,The present invention relates generally to the...
2,surveil,US9727785,https://patents.google.com/patent/US9727785/en,Method and apparatus for tracking targets,"As one example, an unmanned aerial vehicle (U...",Sensor devices are oftentimes used to generate...
3,surveil,US10176405,https://patents.google.com/patent/US10176405/en,Vehicle re-identification techniques using neu...,This technology is useful in a variety of dif...,"Generally speaking, vehicle re-identification ..."
4,surveil,US9715639,https://patents.google.com/patent/US9715639/en,Method and apparatus for detecting targets,"As one example, an unmanned aerial vehicle (U...",Target detection may be performed in a number ...
5,surveillance,US10438055,https://patents.google.com/patent/US10438055/en,Human facial detection and recognition system,"In one particular embodiment, the at least tw...","Referring now to FIG. 10A, process 2000 begins..."
6,surveillance,WO2016159199A1,https://patents.google.com/patent/WO2016159199...,Method for re-identification of objects,This invention relates generally to computer ...,This invention relates generally to computer ...
7,surveillance,US10618673,https://patents.google.com/patent/US10618673/en,Systems and methods for dynamic planning and o...,"Since they are generally quite maneuverable, ...",Remotely-piloted small UAVs (sUAVs) are increa...
8,surveillance,WO2019052917A1,https://patents.google.com/patent/WO2019052917...,Subject identification systems and methods,Techniques described herein may also be appli...,investigation. Techniques described herein may...
9,surveillance,CN103959330A,https://patents.google.com/patent/CN103959330A/en,Systems and methods for matching visual object...,Can implement computing equipment 800 for smal...,Can implement computing equipment 800 for smal...


In [156]:
'''Save in pretty format'''
codes_tables = {}
for code in codes_info:
    table = codes_info[code]
    table.sentence = table.apply(lambda row: row.sentence.replace(row.keyword, row.keyword.upper()), axis=1)  # capitalize keyword
    table.section = table.apply(lambda row: row.section.replace(row.keyword, row.keyword.upper()), axis=1)  # capitalize keyword
    mini_tables = [pd.concat([pd.DataFrame([(keyword,'','','','','')], columns=columns), table.query(f'keyword == "{keyword}"')]) # keyword's minitable
                   for keyword in table.keyword.unique()]
    table = pd.concat(mini_tables)
    codes_tables[code] = table
    
# Write to xlsx file
with pd.ExcelWriter(join(data_dir, 'keyword_examples.xlsx')) as writer:
    for code, table in codes_tables.items():
        table.to_excel(writer, sheet_name=code)
        
# Show first
display(codes_tables[list(codes_tables)[0]])

Unnamed: 0,keyword,patent,url,title,sentence,section
0,iris,,,,,
0,iris,US8050463,https://patents.google.com/patent/US8050463/en,Iris recognition system having image quality m...,"More particularly, the invention pertains to ...",The present invention pertains to recognition ...
1,iris,US10289908,https://patents.google.com/patent/US10289908/en,"Method, apparatus, and computer program produc...",The method may further include estimating a f...,"In some embodiments, a method may be provided ..."
2,iris,US8761458,https://patents.google.com/patent/US8761458/en,"System for iris detection, tracking and recogn...","More particularly, the invention pertains to ...",The present invention pertains to recognition ...
3,iris,CN104735361A,https://patents.google.com/patent/CN104735361A/en,Method and apparatus for acquiring a set of im...,It should be understood that imaging device ca...,It should be understood that imaging device ca...
...,...,...,...,...,...,...
55,finger,US9349039,https://patents.google.com/patent/US9349039/en,Gesture recognition device and control method ...,The gesture expressed by a shape of FINGERs i...,A gesture is extracted from the image by detec...
56,finger,US9046962,https://patents.google.com/patent/US9046962/en,"Methods, systems, apparatuses, circuits and as...",(1990) gave a review of academic research on ...,Sears et al. (1990) gave a review of academic ...
57,finger,US9412003,https://patents.google.com/patent/US9412003/en,"Discriminant function specifying device, discr...","For example, the multiple feature quantities ...",It is preferable in the biometric identificati...
58,finger,US10163215,https://patents.google.com/patent/US10163215/en,Object learning and recognition method and system,"Referring to FIG. 4, when an object is a righ...","Referring to FIG. 2, when an object is a regul..."


In [135]:
'''Save in format that will look nice in google sheets'''
table = codes_info['body_parts']
table.sentence = table.apply(lambda row: row.sentence.replace(row.keyword, row.keyword.upper()), axis=1)  # capitalize keyword
table.section = table.apply(lambda row: row.section.replace(row.keyword, row.keyword.upper()), axis=1)  # capitalize keyword
mini_tables = [pd.concat([pd.DataFrame([(keyword,'','','','','')], columns=columns), table.query(f'keyword == "{keyword}"')]) # keyword's minitable
               for keyword in table.keyword.unique()]
table = pd.concat(mini_tables)
# table.loc[table.keyword.duplicated(), 'keyword'] = ''  # only note keyword once
display(table)
# table.to_csv(join(data_dir, 'keyword_examples.tsv'), sep='\t', index=False)

Unnamed: 0,keyword,patent,url,title,sentence,section
0,iris,,,,,
0,iris,US8050463,https://patents.google.com/patent/US8050463/en,Iris recognition system having image quality m...,,
1,iris,US10289908,https://patents.google.com/patent/US10289908/en,"Method, apparatus, and computer program produc...",,
2,iris,US8761458,https://patents.google.com/patent/US8761458/en,"System for iris detection, tracking and recogn...",,
3,iris,CN104735361A,https://patents.google.com/patent/CN104735361A/en,Method and apparatus for acquiring a set of im...,,
...,...,...,...,...,...,...
55,finger,US9349039,https://patents.google.com/patent/US9349039/en,Gesture recognition device and control method ...,,
56,finger,US9046962,https://patents.google.com/patent/US9046962/en,"Methods, systems, apparatuses, circuits and as...",,
57,finger,US9412003,https://patents.google.com/patent/US9412003/en,"Discriminant function specifying device, discr...",,
58,finger,US10163215,https://patents.google.com/patent/US10163215/en,Object learning and recognition method and system,,


# Show current keywords quick

In [34]:
print('\n'.join([code + ': ' + ', '.join(codes[code]) + '\n' for code in codes_of_interest]))

surveil: surveil, surveillance

hard_crime: prisoner, prison

body_parts: iris, irises, face, facial, torso, anatomy, anatomies, limb, hand, finger

bodies: pedestrian, foot traffic

demographic: ethnicity, gender, female, male, woman, man, sex

children: child, children, kid, youth, age

scenes: room, office, street, crowd, home, house, apartment, airport

hard_movement: travel, license plate, airport, baggage

soft_influence: advertisement, purchase



# Recalculate stats

In [35]:
# Quick check prevalence of norm
# We see there is a norm that for entitites authoring papers with patents - many are patented for surv. (See paper for more details.)
# Quantify how prevalent this norm is.

for source in ['Institution', 'Country', 'Field']:
    sources = analyze_sources(papers, source)
    sources['percent_of_interest'] = sources.n_papers_of_interest / sources.n_papers_patented
    n_surveillance_sources = len(sources.query("(n_papers_patented > 0) & (percent_of_interest > .5)"))
    n_relevant_sources = len(sources.query("(n_papers_patented > 0)"))
    print(f'{100 * n_surveillance_sources / n_relevant_sources:.1f}% ({n_surveillance_sources} out of {n_relevant_sources}) of patenting {source.lower()}s follow this norm.')

75.5% (608 out of 805) of patenting institutions follow this norm.
87.9% (51 out of 58) of patenting countrys follow this norm.
72.3% (3071 out of 4247) of patenting fields follow this norm.


# Prevalence of medical words

In [16]:
patents_marked_as_surv = set(papers.patents_of_interest.sum())
patents_marked_as_medical = set(papers.medical.sum())
print('Marked as surv: ', len(patents_marked_as_surv))
print('Marked as surv and not medical: ', len(patents_marked_as_surv - patents_marked_as_medical))
print('Fracion of surv that are valid: ', len(patents_marked_as_surv - patents_marked_as_medical)/len(patents_marked_as_surv))

Marked as surv:  14761
Marked as surv and not medical:  12755
Fracion of surv that are valid:  0.8641013481471445


In [18]:
papers_marked_as_surv = set(papers.query('n_patents_of_interest > 0').index)
papers_marked_as_medical = set(papers[papers.medical_code.apply(len) > 0])
print('Marked as surv: ', len(papers_marked_as_surv))
print('Marked as surv and not medical: ', len(papers_marked_as_surv - papers_marked_as_medical))
print('Fracion of surv that are valid: ', len(papers_marked_as_surv - papers_marked_as_medical)/len(papers_marked_as_surv))

Marked as surv:  4426
Marked as surv and not medical:  4426
Fracion of surv that are valid:  1.0


In [71]:
papers[['patents', 'medical_code']][papers.patents.apply(len) != 0].explode('patents').

Unnamed: 0_level_0,patents,medical_code
PaperId,Unnamed: 1_level_1,Unnamed: 2_level_1
1904325426,US10452920,[]
1904325426,US10380428,[]
1904325426,US10311913,[]
2061320885,US9808549,"[US9808549, US9808549]"
2741327137,WO2019133841A1,[WO2019133841A1]
...,...,...
2105661278,US10304254,[]
2105661278,US10083522,[]
2105661278,US10068344,[]
2118120587,US7603324,[]


In [None]:
patents = list(set(papers.patents.sum()))
patent_keywords = [for patent in patents]
for word in keywords_of_interest:
    

# See which keywords have the strongest effect

In [42]:
# For each word, check its impact on the totals
n_papers = len(papers)
n_papers_patented = len(papers.query('n_patents > 0'))
n_patents = len(set(papers.patents.sum()))
print(f'Number of papers: {n_papers}. Number of patents: {n_patents}')
word_contributions = {}
for word in keywords_of_interest:
    mark_interesting(papers, [word])
    surv_papers = papers.query("n_patents_of_interest > 0")
    surv_patents_list = surv_papers.patents_of_interest.sum()  # including repeats
    n_surv_patents = len(set(surv_patents_list)) if surv_patents_list else 0
    word_contributions[word] = 100*len(surv_papers)/n_papers_patented
    # if n_surv_patents/n_patents < .01:  # percent of patents 
    # print(f'{word}:\n {100*len(surv_papers)/n_papers_patented:2.1f}% of patented papers lead to a patent with this word. {100*n_surv_patents/n_patents:2.1f}% of patents have this word.')
    # for source in ['Institution', 'Country', 'Field']:
    #     sources = analyze_sources(all_papers, source)
    #     sources['percent_of_interest'] = sources.n_papers_of_interest / sources.n_papers_patented
    #     n_surveillance_sources = len(sources.query("(n_papers_patented > 0) & (percent_of_interest > .5)"))
    #     n_relevant_sources = len(sources.query("(n_papers_patented > 0)"))
    #     print(f'{100 * n_surveillance_sources / n_relevant_sources:.1f}% ({n_surveillance_sources} out of {n_relevant_sources}) of {source.lower()}s follow this norm.')
    # print()

Number of papers: 19551. Number of patents: 23471


In [48]:
sorted(word_contributions.items(), key=lambda word_contribution: word_contribution[1], reverse=True)

[('face', 43.53472614342179),
 ('hand', 30.980613589309243),
 ('facial', 25.898738942217204),
 ('surveillance', 19.047619047619047),
 ('room', 18.539431582909845),
 ('travel', 18.501788067005457),
 ('home', 15.847920195746283),
 ('finger', 14.624505928853756),
 ('pedestrian', 13.250517598343686),
 ('office', 12.234142668925278),
 ('street', 11.76359872012046),
 ('child', 11.537737624694147),
 ('age', 10.107284020327498),
 ('house', 10.013175230566535),
 ('advertisement', 8.300395256916996),
 ('purchase', 7.942781855825334),
 ('crowd', 6.60643704121965),
 ('torso', 6.305288913984566),
 ('man', 6.230001882175795),
 ('male', 5.853566723131941),
 ('gender', 5.740636175418784),
 ('children', 5.740636175418784),
 ('iris', 5.420666290231508),
 ('limb', 4.81837003576134),
 ('female', 4.81837003576134),
 ('license plate', 4.573687182382835),
 ('anatomy', 4.498400150574064),
 ('airport', 3.5384904950122342),
 ('woman', 1.976284584980237),
 ('sex', 1.8257105213626952),
 ('ethnicity', 1.6563146997

In [24]:
# # For each word, check its impact on the totals
# n_papers = len(all_papers)
# n_papers_patented = len(all_papers.query('n_patents > 0'))
# n_patents = len(set(all_papers.patents.sum()))
# print(f'Number of papers: {n_papers}. Number of patents: {n_patents}')
# # If you use all words
# all_words = sum([codes[code] for code in codes_of_interest], [])
# fraction_surv_papers = len(all_papers.query("n_patents_of_interest > 0")) / n_papers_patented
# # If you drop a word
# print('Dropping...')
# for _ in range(30):
#     word_impacts = []
#     for word in all_words:
#         words = all_words.copy()
#         words.remove(word)
#         mark_interesting(all_papers, words)
#         surv_papers = all_papers.query("n_patents_of_interest > 0")
#         surv_patents_list = surv_papers.patents_of_interest.sum()  # including repeats
#         n_surv_patents = len(set(surv_patents_list)) if surv_patents_list else 0
#         word_impacts.append((word, len(surv_papers)/n_papers_patented - fraction_surv_papers))
#     worst_word, drop = sorted(word_impacts, key=lambda word_drop: word_drop[1])[-1]
#     print(worst_word, drop)
#     all_words.remove(worst_word)

# all_papers = filter_paper_years(load_papers())
# mark_interesting(all_papers, all_words)
# # Quick check prevalence of norm
# # We see there is a norm that for entitites authoring papers with patents, at least half are used in surveillance. (See paper for more details.)
# # Quantify how prevalent this norm is
# for source in ['Institution', 'Country', 'Field']:
#     sources = analyze_sources(all_papers, source)
#     sources['percent_of_interest'] = sources.n_papers_of_interest / sources.n_papers_patented
#     n_surveillance_sources = len(sources.query("(n_papers_patented > 0) & (percent_of_interest > .5)"))
#     n_relevant_sources = len(sources.query("(n_papers_patented > 0)"))
#     print(f'{100 * n_surveillance_sources / n_relevant_sources:.1f}% ({n_surveillance_sources} out of {n_relevant_sources}) of {source.lower()}s follow this norm.')