In [12]:
import json
import os
import pandas as pd
from src.utils.UsefulPaths import Paths
from src.utils.SpacyUtils import SpacyUtil
def _format_two_decimals(value):
    """Return ``value`` rendered with exactly two digits after the decimal point."""
    return '%.2f' % value


# Use the two-decimal formatter whenever pandas displays floating point values.
pd.set_option('display.float_format', _format_two_decimals)

In [13]:
# Project path registry; its `json_subsectors` attribute locates the JSON file read below.
paths = Paths()

# JSON text is UTF-8 by specification. Decoding it explicitly keeps the result
# independent of the platform's default locale encoding (which is not UTF-8 on
# every system and could otherwise corrupt or reject non-ASCII characters).
with open(paths.json_subsectors, 'r', encoding='utf-8') as file:
    # `subsectors` holds whatever top-level JSON value the file contains.
    subsectors = json.load(file)

In [14]:
# One record per entry of `subsectors`: the mapping key is stored under
# 'subsector' and the entry's attributes supply the remaining fields.
records = [
    {'subsector': name, **attrs}
    for name, attrs in subsectors.items()
]

# Build the frame, normalise the column labels to lower snake_case, and replace
# missing values with empty strings.
df = (
    pd.DataFrame(records)
    .rename(columns=lambda label: label.lower().replace(' ', '_'))
    .fillna('')
)

In [15]:
# Show the current contents of `df` as this cell's output.
df

Unnamed: 0,subsector,definition,keywords,old_name,does_include,does_not_include
0,Artificial Intelligence; Big Data and Analytics,Artificial Intelligence companies offer produc...,"Automated intelligence, assisted intelligence,...","Artificial Intelligence, Big Data and Analytics",,
1,Advanced Manufacturing and Robotics,"In comparison to Traditional Manufacturing, Ad...","3d printing, industrial IoT, internet of thin...",Advanced Manufacturing and Robotics,"Autonomous driving, Industrial robots, Industr...",Traditional manufacturing machines
2,Clean Technology,Cleantech or clean technology is an umbrella t...,"Clean energy, and other forms of environmental...",Cleantech,,"oil and gas, petrochemicals"
3,Financial Technology,Describes a business that aims at providing fi...,"Insurance Tech, Risk Management, Trading, Port...",Fintech,,"Brick & Mortar banks, Old brick and mortar Ins..."
4,Blockchain,Companies that develop applications using bloc...,"Distributed ledgers, Digital Mining,Cryptocurr...",Blockchain,,
5,Cybersecurity,"Cybersecurity is the body of technologies, pro...","cyber security, network security, data securit...",Cybersecurity,"Application security, Information security, Ne...",
6,Agriculture Technology,Technologies to help the agriculture industry ...,,Agtech,Precision agriculture - PA is an approach to f...,"Farms, Vineyards, Coffee roasters, Beverages"
7,New Food,New Food includes technologies that can be lev...,"artificial meat, Alternative protein, Plant-ba...",New Food,,"alt- proteins based skincare products, cannabi..."
8,Advertising Technology,Advertising technology - different types of an...,"Conversion/optimization, Email marketing, Mobi...",Adtech,,Companies whose products and services are not ...
9,Blue Economy,"Blue economy is the ""sustainable use of ocean ...","ocean sustainability, aquaculture, seafloor ma...",Blue Economy,,


In [16]:
# ADD OTHERS TO SUBSECTOR DF
# Appends a catch-all 'others' row whose `does_not_include` field is the
# comma-separated keywords of every named subsector.

# NOTE(review): this text says "New Foods" while the subsector column uses
# "New Food" — confirm which spelling downstream consumers expect.
definition = 'Includes all patents that do not fit into the specific subsectors of Artificial Intelligence; Big Data and Analytics, Advanced Manufacturing and Robotics, Clean Technology, Financial Technology, Blockchain, Cybersecurity, Agriculture Technology, New Foods, Advertising Technology, Blue Economy, Digital Media, Gaming, Augmented Reality; Virtual Reality, Educational Technology, Industry 4.0, Biopharmaceutical; Biotechnology, Medical Technology; Medical Devices'

# Leave out any existing 'others' row so that re-running this cell neither
# appends a duplicate nor feeds the catch-all's own fields back into itself.
df_named = df[df['subsector'] != 'others']

# A missing keyword list can show up as NaN or as an empty/blank string; both
# are skipped so the joined text contains no empty ", , " segments.
named_keywords = [
    keywords
    for keywords in df_named['keywords'].dropna().unique()
    if str(keywords).strip()
]
does_not_include = ', '.join(named_keywords)

df_others = pd.DataFrame({
    'subsector': ['others'],
    'definition': [definition],
    'keywords': [''],
    'old_name': ['others'],
    'does_include': [''],
    'does_not_include': [does_not_include]
})

# ignore_index renumbers the rows so the appended row continues the sequence.
df = pd.concat([df_named, df_others], ignore_index=True)

# Display only the newly built row for inspection.
df_others

Unnamed: 0,subsector,definition,keywords,old_name,does_include,does_not_include
0,others,Includes all patents that do not fit into the ...,,others,,"Automated intelligence, assisted intelligence,..."


In [17]:
# Text normaliser from the project's SpacyUtil wrapper, configured with the
# options below.
spacy_util = SpacyUtil(
    model='en_core_web_sm',
    lemma=True,
    remove_stopwords=False,
    lower=True,
    remove_numbers=False,
)

# Free-text columns that get a preprocessed copy and a token count.
text_fields = ('definition', 'keywords', 'does_include', 'does_not_include')

# First pass: add all '<field>_preprocessed' columns, in field order.
for field in text_fields:
    df[f'{field}_preprocessed'] = df[field].apply(spacy_util.preprocess_text)

# Second pass: add all 'token_<field>' columns after the preprocessed ones,
# counting whitespace-separated tokens of the preprocessed text.
for field in text_fields:
    df[f'token_{field}'] = df[f'{field}_preprocessed'].apply(
        lambda text: len(str(text).split())
    )

df

Unnamed: 0,subsector,definition,keywords,old_name,does_include,does_not_include,definition_preprocessed,keywords_preprocessed,does_include_preprocessed,does_not_include_preprocessed,token_definition,token_keywords,token_does_include,token_does_not_include
0,Artificial Intelligence; Big Data and Analytics,Artificial Intelligence companies offer produc...,"Automated intelligence, assisted intelligence,...","Artificial Intelligence, Big Data and Analytics",,,artificial intelligence company offer product ...,automated intelligence assist intelligence aug...,,,73,62,0,0
1,Advanced Manufacturing and Robotics,"In comparison to Traditional Manufacturing, Ad...","3d printing, industrial IoT, internet of thin...",Advanced Manufacturing and Robotics,"Autonomous driving, Industrial robots, Industr...",Traditional manufacturing machines,in comparison to traditional manufacturing adv...,3d printing industrial iot internet of thing...,autonomous driving industrial robot industrial...,traditional manufacturing machine,29,21,27,3
2,Clean Technology,Cleantech or clean technology is an umbrella t...,"Clean energy, and other forms of environmental...",Cleantech,,"oil and gas, petrochemicals",cleantech or clean technology be an umbrella t...,clean energy and other form of environmental a...,,oil and gas petrochemical,67,30,0,4
3,Financial Technology,Describes a business that aims at providing fi...,"Insurance Tech, Risk Management, Trading, Port...",Fintech,,"Brick & Mortar banks, Old brick and mortar Ins...",describe a business that aim at provide financ...,insurance tech risk management trading portfol...,,brick mortar bank old brick and mortar insuran...,65,21,0,9
4,Blockchain,Companies that develop applications using bloc...,"Distributed ledgers, Digital Mining,Cryptocurr...",Blockchain,,,company that develop application use blockchai...,distribute ledger digital mining cryptocurrenc...,,,52,26,0,0
5,Cybersecurity,"Cybersecurity is the body of technologies, pro...","cyber security, network security, data securit...",Cybersecurity,"Application security, Information security, Ne...",,cybersecurity be the body of technology proces...,cyber security network security data security ...,application security information security netw...,,42,15,17,0
6,Agriculture Technology,Technologies to help the agriculture industry ...,,Agtech,Precision agriculture - PA is an approach to f...,"Farms, Vineyards, Coffee roasters, Beverages",technology to help the agriculture industry to...,,precision agriculture pa be an approach to far...,farms vineyards coffee roaster beverages,84,0,62,5
7,New Food,New Food includes technologies that can be lev...,"artificial meat, Alternative protein, Plant-ba...",New Food,,"alt- proteins based skincare products, cannabi...",new food include technology that can be levera...,artificial meat alternative protein plant base...,,alt- protein base skincare product cannabis re...,63,52,0,27
8,Advertising Technology,Advertising technology - different types of an...,"Conversion/optimization, Email marketing, Mobi...",Adtech,,Companies whose products and services are not ...,advertising technology different type of analy...,conversion optimization email marketing mobile...,,company whose product and service be not focus...,50,38,0,54
9,Blue Economy,"Blue economy is the ""sustainable use of ocean ...","ocean sustainability, aquaculture, seafloor ma...",Blue Economy,,,blue economy be the sustainable use of ocean r...,ocean sustainability aquaculture seafloor mapp...,,,53,27,0,0


In [18]:
# Write the subsector table to 'subsector.csv' under `paths.data_processed`,
# without the row index column.
output_path = os.path.join(paths.data_processed, 'subsector.csv')
df.to_csv(output_path, index=False)