In [2]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Competition files that must all sit together in the dataset directory.
files = [
    'train.csv',
    'test.csv',
    'sample_submission.csv',
]
# Candidate directories to search, tried in this order.
possible_roots = [
    '/kaggle/input/',
    './',
    '../../patent_phrase_similarity/resources',
]
def find_root_path(_possible_roots, _files=None):
    """Return the absolute path of the first directory that holds every required file.

    Each candidate root is walked recursively, in the order given, and the
    first directory whose file listing contains all required names wins.

    :param _possible_roots: iterable of directory paths to search, in priority order.
    :param _files: iterable of file names that must all be present in a single
        directory; defaults to the module-level ``files`` list.
    :return: absolute path of the matching directory.
    :raises FileNotFoundError: if no directory under any root contains all the files.
    """
    _required = set(files if _files is None else _files)
    for _root_path in _possible_roots:
        for dirname, _, filenames in os.walk(_root_path):
            if _required.issubset(filenames):
                # os.walk already yields ``dirname`` prefixed with the root it was
                # given, so joining it onto ``_root_path`` again would duplicate a
                # relative root (e.g. "data/data/inner").
                return os.path.abspath(dirname)

    raise FileNotFoundError(f"Could not find a good path, check your curdir {os.path.abspath(os.curdir)}")
# Resolve the dataset directory once, then build the path of each competition file.
root_path = find_root_path(possible_roots)

train_filepath = os.path.join(root_path, 'train.csv')
# Must point at 'test.csv'; pointing it at 'train.csv' makes the "test" frame a copy of the training data.
test_filepath = os.path.join(root_path, 'test.csv')
sample_filepath = os.path.join(root_path, 'sample_submission.csv')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

root_path

'D:\\workspace\\kaggle\\patent_phrase_similarity\\resources'

In this dataset, you are presented pairs of phrases (an `anchor` and a `target` phrase) and asked to rate how similar they are on a scale from 0 (not at all similar) to 1 (identical in meaning). This challenge differs from a standard semantic similarity task in that similarity has been scored here within a patent's `context`, specifically its [CPC classification (version 2021.05)](https://en.wikipedia.org/wiki/Cooperative_Patent_Classification), which indicates the subject to which the patent relates. For example, while the phrases "bird" and "Cape Cod" may have low semantic similarity in normal language, the likeness of their meaning is much closer if considered in the context of "house".

This is a code competition, in which you will submit code that will be run against an unseen test set. The unseen test set contains approximately 12k pairs of phrases. A small public test set has been provided for testing purposes, but is not used in scoring.

Information on the meaning of CPC codes may be found on the [USPTO website](https://www.uspto.gov/web/patents/classification/cpc/html/cpc.html). The CPC version 2021.05 can be found on the [CPC archive website](https://www.cooperativepatentclassification.org/Archive).

## Score meanings
The scores are in the 0-1 range with increments of 0.25 with the following meanings:

- 1.0 - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
- 0.75 - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
- 0.5 - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
- 0.25 - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
- 0.0 - Unrelated.
## Files
- train.csv - the training set, containing phrases, contexts, and their similarity scores
- test.csv - the test set, identical in structure to the training set but without the score
- sample_submission.csv - a sample submission file in the correct format
## Columns
- id - a unique identifier for a pair of phrases
- anchor - the first phrase
- target - the second phrase
- context - [the CPC classification (version 2021.05)](https://en.wikipedia.org/wiki/Cooperative_Patent_Classification), which indicates the subject within which the similarity is to be scored
- score - the similarity. This is sourced from a combination of one or more manual expert ratings.
> "Google Patent Phrase Similarity Dataset" by Google is licensed under a Creative Commons Attribution 4.0 International License (CC BY 4.0)

In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Remove the column-width limit so long strings are shown in full when frames are displayed.
pd.set_option('display.max_colwidth', None)

In [6]:
# Load the three competition CSVs from the paths built in the first cell.
train_df = pd.read_csv(train_filepath)
test_df = pd.read_csv(test_filepath)
sample_df = pd.read_csv(sample_filepath)

In [7]:
# NOTE(review): only the last expression of a cell is displayed, so the result of this
# selection (the context values equal to "A62") is computed and then discarded.
train_df.context[train_df.context == "A62"]
# Show the single row at integer position 14280.
train_df.iloc[14280]

id         06dc4a2081a1cfe6
anchor             gas leak
target                fault
context                 A62
score                   0.5
Name: 14280, dtype: object

In [8]:
train_df.context.unique()

array(['A47', 'A61', 'A62', 'C01', 'F16', 'F24', 'F28', 'H01', 'H04',
       'B23', 'B41', 'D03', 'E03', 'C08', 'D01', 'D21', 'C07', 'A45',
       'B01', 'B08', 'G04', 'G06', 'B65', 'G16', 'G01', 'A41', 'C23',
       'F23', 'B25', 'A63', 'B28', 'B63', 'F04', 'B60', 'B32', 'C09',
       'C02', 'G03', 'C10', 'B61', 'C21', 'F42', 'A23', 'C11', 'B29',
       'F02', 'B62', 'B64', 'E21', 'B24', 'B22', 'H05', 'B27', 'E04',
       'B21', 'D06', 'C04', 'B05', 'G02', 'H03', 'C06', 'G11', 'C12',
       'E02', 'F15', 'A46', 'B66', 'G07', 'G08', 'C22', 'B44', 'A01',
       'F03', 'C25', 'F22', 'G05', 'G21', 'B07', 'F41', 'E01', 'H02',
       'C13', 'F01', 'F27', 'C14', 'A44', 'B67', 'A24', 'B02', 'E05',
       'D05', 'F25', 'A43', 'A22', 'A21', 'E06', 'F21', 'G10', 'C03',
       'B81', 'F17', 'B03', 'G09', 'D04', 'F26', 'B31'], dtype=object)

In [9]:
train_df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


# CPC datasets

For example "A01B33/00"

- A: Section A
- 01: Class 
- B: Subclass
- 33: Main group
- 00: Subgroup ("/00" denotes the main group itself)

Section
- A: Human Necessities
- B: Operations and Transport
- C: Chemistry and Metallurgy
- D: Textiles
- E: Fixed Constructions
- F: Mechanical Engineering
- G: Physics
- H: Electricity
- Y: Emerging Cross-Sectional Technologies

In [None]:
from xml.etree import ElementTree as ET
# Parse the scheme XML file for CPC section A and keep its root element for inspection.
cpc_scheme_A = ET.parse('./CPCSchemeXML202105/cpc-scheme-A.xml')
cpc_scheme_A_root = cpc_scheme_A.getroot()

In [None]:
from xml.etree.ElementTree import Element
# First direct child of the section-A root (not used again in this cell).
elem: Element = list(cpc_scheme_A_root)[0]
# Display the root's tag mapped to the list of its direct children.
dict([(cpc_scheme_A_root.tag, list(cpc_scheme_A_root))])

In [None]:
# Nested description of which parts of a CPC scheme XML tree to extract, consumed by
# get_data_from_xml_with(). Keys that are not wrapped in double underscores are XML tag
# names; matching one of them descends one level in this mapping. Reserved keys:
#   '__name__'  - bucket of the output dict under which the values are stored
#   '__key__'   - tag whose text starts a new (empty) entry in that bucket
#   '__value__' - tag whose text is appended to the current entry
# NOTE(review): two consecutive nesting levels both use the name 'class' — presumably the
# scheme has an intermediate grouping level between section and class; confirm against the XML.
cpc_scheme_xml_map = {
    'class-scheme': {
        'classification-item': {
            '__name__': 'section',
            '__key__': 'classification-symbol',
            'class-title': {
                'title-part' : {
                    '__name__': 'section',
                    '__value__': 'text'
                }
            },
            'classification-item': {
                '__name__': 'class',
                '__key__': 'classification-symbol',
                'class-title': {
                    'title-part' : {
                        '__name__': 'class',
                        '__value__': 'text'
                    }
                },
                'classification-item' : {
                    '__name__': 'class',
                    '__key__': 'classification-symbol',
                    'class-title': {
                        'title-part' : {
                            '__name__': 'class',
                            '__value__': 'text'
                            # reference.text reference.CPC-specific-text.text
                        }
                    },
                    'classification-item' : {
                        '__name__': 'subclass',
                        '__key__': 'classification-symbol',
                        'class-title': {
                            'title-part' : {
                                '__name__': 'subclass',
                                '__value__': 'text'
                                # reference.text reference.CPC-specific-text.text
                            }
                        }
                    }
                }
            }
        }
    }
}


# Iterate over each key and store the values in another nested dictionary; it is flattened afterwards.
def get_data_from_xml_with(xml_mapper, xml_root_tag, plain_xml_data=None, plain_key=""):
    """Collect text values from an XML element tree as directed by ``xml_mapper``.

    The element is looked at through a two-entry view: itself under ``'__elem__'``
    and its children under its own tag name. When the mapper's ``'__key__'`` names
    the element's tag, the element text opens a new empty list in the bucket named
    by ``'__name__'``; when ``'__value__'`` names it, the text is appended to the
    list of the current key. Children are then visited with the sub-mapper stored
    under the element's tag, if there is one.

    :param xml_mapper: dict describing what to extract at this level.
    :param xml_root_tag: the XML element to process.
    :param plain_xml_data: accumulator dict, updated in place; created when ``None``.
    :param plain_key: key of the entry that values are currently appended to.
    :return: tuple ``(plain_xml_data, plain_key)`` after processing the element and its children.
    """
    collected = {} if plain_xml_data is None else plain_xml_data

    node_view = dict([('__elem__', xml_root_tag), (xml_root_tag.tag, list(xml_root_tag))])
    current = node_view['__elem__']

    key_tag = xml_mapper.get('__key__', "")
    value_tag = xml_mapper.get('__value__', "")
    bucket_name = xml_mapper.get('__name__', "")

    if key_tag in node_view:
        # This element carries the symbol: open a fresh entry and make it the current key.
        bucket = collected.setdefault(bucket_name, {})
        plain_key = current.text
        bucket[current.text] = []

    if value_tag in node_view:
        # This element carries a value for the entry opened most recently.
        collected[bucket_name][plain_key].append(current.text)

    for tag, children in node_view.items():
        if tag not in xml_mapper:
            continue
        child_mapper = xml_mapper[tag]
        for child in children:
            collected, plain_key = get_data_from_xml_with(
                child_mapper, child, plain_xml_data=collected, plain_key=plain_key
            )

    return collected, plain_key

In [None]:
def items_not_in_list(_left, _right, how="left"):
    """Return the items of one list that do not occur in the other, keeping their order.

    :param _left: first collection.
    :param _right: second collection.
    :param how: with ``"left"`` the items of ``_left`` missing from ``_right`` are
        returned; with any other value the items of ``_right`` missing from ``_left``.
    :return: new list with the missing items (duplicates are kept).
    """
    if how == "left":
        candidates, excluded = _left, _right
    else:
        candidates, excluded = _right, _left
    return [entry for entry in candidates if entry not in excluded]

def combine_dicts(*_dicts, how="left"):
    """
    Merge any number of dictionaries into the first one, recursively.

    Values stored under the same key are combined as follows:

    * two sequences (list/tuple) are concatenated, left items first; the result
      type (list or tuple) follows the side selected by ``how``;
    * two dictionaries are merged recursively with these same rules;
    * anything else cannot be merged, so the value of the side selected by
      ``how`` is kept.

    Keys that exist only in a later dictionary are added to the result.

    Note: the first dictionary is updated in place and returned, and values taken
    from later dictionaries are stored by reference, not copied.

    >>> combine_dicts({'claveA': [1,2,3,4]}, {'claveA': [5,6,7,8]})
    {'claveA': [1, 2, 3, 4, 5, 6, 7, 8]}
    >>> combine_dicts({'claveA': []}, {'claveB': [5,6,7,8]})
    {'claveA': [], 'claveB': [5, 6, 7, 8]}
    >>> combine_dicts({'claveA': 1}, {'claveA': 2})
    {'claveA': 1}

    :param _dicts: dictionaries to merge, in order; the first one is mutated.
    :param how: ``"left"`` lets the left side win on conflicts, any other value the right side.
    :return: the first dictionary after merging (``{}`` when called without arguments).
    """
    if len(_dicts) == 0:
        return {}
    _left_dict = _dicts[0]
    # Only a lone dictionary is returned untouched; an *empty* first dictionary
    # must still receive the content of the following ones.
    if len(_dicts) == 1:
        return _left_dict

    _right_dict = _dicts[1]
    _other_dicts = _dicts[2:]
    _prefer_left = how == "left"

    for key in _left_dict:
        if key not in _right_dict:
            continue

        _left = _left_dict[key]
        _right = _right_dict[key]
        if isinstance(_left, (tuple, list)) and isinstance(_right, (tuple, list)):
            # Both are sequences: concatenate, using the preferred side's type.
            _winner = _left if _prefer_left else _right
            _merged = [*_left, *_right]
            _res = tuple(_merged) if isinstance(_winner, tuple) else _merged
        elif isinstance(_left, dict) and isinstance(_right, dict):
            # Both are dictionaries: merge recursively.
            _res = combine_dicts(_left, _right, how=how)
        else:
            # Types cannot be merged: keep the value of the preferred side.
            _res = _left if _prefer_left else _right

        _left_dict[key] = _res

    # Add the keys that only the right dictionary has.
    for key, _value in _right_dict.items():
        if key not in _left_dict:
            _left_dict[key] = _value

    # Keep merging the remaining dictionaries with the same conflict policy.
    if len(_other_dicts) > 0:
        _left_dict = combine_dicts(_left_dict, *_other_dicts, how=how)

    return _left_dict


combine_dicts({'a': {'b': [1,3,4]}}, {'a': {'b': [5]}})

In [None]:
import re
import os
import xml.etree.ElementTree as ET

cpc_data = {}
cpc_data_merged = {}
# cpc_path = "/kaggle/input/cpc-scheme-cml-202105/CPCSchemeXML202105/"
cpc_path = "./CPCSchemeXML202105/"

# Fail with a clear message instead of an opaque unpacking/index error.
if not os.path.isdir(cpc_path):
    raise FileNotFoundError(f"CPC scheme directory not found: {os.path.abspath(cpc_path)}")

# One scheme file per section, named "cpc-scheme-<letter>.xml"; the group captures the letter.
_scheme_file_re = re.compile(r"cpc-scheme-([a-z])\.xml", re.IGNORECASE)

# next() takes only the top-level listing instead of materialising the whole recursive walk.
dirname, _, filenames = next(os.walk(cpc_path), (cpc_path, [], []))
# Sorted so the sections are always loaded (and later merged) in the same order.
for filename in sorted(filenames):
    # fullmatch rejects names that merely start with the pattern (e.g. "cpc-scheme-A.xml.bak").
    _scheme_match = _scheme_file_re.fullmatch(filename)
    if _scheme_match is None:
        continue
    print(filename)
    cpc_scheme = ET.parse(os.path.join(dirname, filename))
    cpc_scheme_root = cpc_scheme.getroot()
    section = _scheme_match.group(1)
    cpc_data[section], _ = get_data_from_xml_with(cpc_scheme_xml_map, cpc_scheme_root)

# combine_dicts merges into its first argument, so the first section's entry in
# cpc_data ends up holding the merged result as well.
cpc_data_merged = combine_dicts(*list(cpc_data.values()))

In [None]:
print(cpc_data_merged.keys())

In [None]:
cpc_data_merged['section']

In [None]:
{k: cpc_data_merged['section'][k] for k in sorted(cpc_data_merged['section'])}

In [None]:
from collections import defaultdict
cpc_list = []

# Sort the entries of every bucket by their key.
for name in cpc_data_merged:
    cpc_data_merged[name] = {k: cpc_data_merged[name][k] for k in sorted(cpc_data_merged[name])}

# The data is hierarchical: a subclass belongs to a class, which belongs to a section,
# so every subclass row can also carry the attributes of its class and its section.
for code, subclass_values in cpc_data_merged['subclass'].items():
    # The code is split as: first character = section, last character = subclass,
    # everything in between = class (e.g. "A01B" -> "A", "01", "B").
    _section = code[0]
    _class = code[1:-1]
    _subclass = code[-1]

    cpc_list.append({
        "sect_class": _section + _class,
        "section": _section,
        "class": _class,
        "subclass": _subclass,
        # tuple(...) copies the collected texts in one step instead of growing a
        # tuple one element at a time.
        "section_text": tuple(cpc_data_merged['section'][_section]),
        "class_text": tuple(cpc_data_merged['class'][f'{_section}{_class}']),
        "subclass_text": tuple(subclass_values),
    })

In [None]:
cpc_df = pd.DataFrame(cpc_list, columns=['sect_class', 'section', 'class', 'subclass', 'section_text', 'class_text', 'subclass_text'])

In [None]:
cpc_df.iloc[0:9]

In [None]:
cpc_df.describe()

In [None]:
cpc_df.to_csv("./cpc_202105.csv", index=False)

In [None]:
# Reload the CPC table from the CSV file written earlier in the notebook.
cpc_df = pd.read_csv("./cpc_202105.csv")
# Rebuild 'class' from 'sect_class' by dropping the leading section letter —
# presumably because the CSV round-trip does not keep the zero-padded text form; TODO confirm.
cpc_df['class'] = cpc_df['sect_class'].apply(lambda x: x[1:])

In [None]:
cpc_df.describe()

In [None]:
import ast

# The *_text columns come back from the CSV as the text form of a tuple; parse them.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot run
# arbitrary code that happens to be stored in the file.
for _text_col in ('section_text', 'class_text', 'subclass_text'):
    cpc_df[_text_col] = cpc_df[_text_col].apply(ast.literal_eval)
cpc_df = cpc_df.rename({'sect_class': 'context'}, axis=1)
# One row per context; every other column becomes the tuple of that group's values.
cpc_df_grouped = cpc_df.groupby('context').agg(tuple)

# For every column except the subclass ones, keep only the first value of the group.
for col in cpc_df_grouped.columns:
    if 'subclass' not in col:
        cpc_df_grouped[col] = cpc_df_grouped[col].apply(lambda row: row[0])

cpc_df = cpc_df_grouped.reset_index()
# subclass_text is now a tuple of tuples (one per subclass); flatten it into a single tuple.
cpc_df['subclass_text'] = cpc_df['subclass_text'].apply(lambda val: tuple([v for l in val for v in l]))

# Merge

In [None]:
# Attach the CPC section/class/subclass columns to every training row via the shared 'context' code.
# NOTE(review): merge() is called without `how`, so pandas' default join type applies —
# confirm that no training rows are dropped for contexts missing from cpc_df.
cpc_train_df = train_df.merge(cpc_df, on=['context'])

In [None]:
cpc_train_df.dtypes

# TRANSFORMS

In [None]:
import re
def count_regex(pattern, raw):
    """Return how many non-overlapping matches of ``pattern`` are found in ``raw``."""
    return sum(1 for _hit in re.finditer(pattern, raw))

def _section_index(letter):
    """Map a section letter to its 0-based offset from 'A'."""
    return ord(letter) - ord('A')


def _context_code(context):
    """Encode a context as section offset * 1000 + the integer value of the remaining characters."""
    return _section_index(context[0]) * 1000 + int(context[1:])


# Numeric encodings of the CPC codes, added as new columns.
cpc_train_df['section_cat'] = cpc_train_df['section'].apply(_section_index)
cpc_train_df['class_cat'] = cpc_train_df['class'].astype(int)
cpc_train_df['context_cat'] = cpc_train_df['context'].apply(_context_code)

# VISUALIZE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Dark-grid seaborn theme with the font scale raised to 1.3, configured in a single call.
sns.set(style="darkgrid", font_scale=1.3)

In [None]:
# One KDE plot per encoded column, with a separate curve (hue) for every score value.
density_grids = [
    sns.FacetGrid(cpc_train_df, hue="score", height=5, aspect=4).map(sns.kdeplot, column).add_legend()
    for column in ("context_cat", "section_cat")
]
density_grids[-1]

In [None]:
# Number of rows per score value, most frequent first. Uses the Series method because
# the top-level pd.value_counts function is deprecated in recent pandas releases.
cpc_train_df['score'].value_counts(sort=True)

In [None]:
# Absolute correlation of every numeric column with the score, strongest first.
# The frame also holds text and tuple columns; selecting the numeric ones explicitly
# keeps corr() working on pandas versions that no longer drop them silently.
cpc_train_df.select_dtypes(include='number').corr()["score"].abs().sort_values(ascending=False)

# WORD EMBEDDINGS
[Guide Reference](https://medium.com/@adriensieg/text-similarities-da019229c894)

- Jaccard Similarity ☹☹☹
- Different embeddings+ K-means ☹☹
- Different embeddings+ Cosine Similarity ☹
- Word2Vec + Smooth Inverse Frequency + Cosine Similarity 😊
- Different embeddings+LSI + Cosine Similarity ☹
- Different embeddings+ LDA + Jensen-Shannon distance 😊
- Different embeddings+ Word Mover Distance 😊😊
- Different embeddings+ Variational Auto Encoder (VAE) 😊 😊
- Different embeddings+ Universal sentence encoder 😊😊
- Different embeddings+ Siamese Manhattan LSTM 😊😊😊
- BERT embeddings + Cosine Similarity ❤
- Knowledge-based Measures ❤

![Cosine Distance](./imgs/1_5J8YlnfnZlzFobQC9cGk-w.png)

## Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

## Term Frequency - Inverse Document Frequency (TF - IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

## Continuous BoW (CBoW) model and SkipGram Model embedding (SkipGram)

## Pre Trained - Word2Vec (by Google)

## Pre Trained - GloVe (by Stanford)

## Pre Trained - fastText (by Facebook)

## Poincaré embedding


## Node2Vec embedding based on Random Walk and Graph


## BERT Embeddings

# MODELS

## LDA with Jensen-Shannon distance

In [None]:
def jensen_shannon(query, matrix):
    """
    Jensen-Shannon distance between one distribution and every row of a matrix.

    Despite being used for "similarity" search, the value is a distance:
    0 means the two distributions are identical and larger means less alike.

    :param query: 1-D array of length T (e.g. an LDA topic distribution for a document).
    :param matrix: 2-D array of shape (M, T) holding one distribution per row
        (e.g. the topic distributions of the entire corpus).
    :return: array of length M, where M is the number of rows (documents) in ``matrix``.
    """
    # Imported locally: nothing else in the notebook brings `entropy` into scope.
    from scipy.stats import entropy

    # p, q and m follow the usual Jensen-Shannon notation.
    p = query[None, :].T  # column vector, shape (T, 1)
    q = matrix.T          # shape (T, M): one column per document
    m = 0.5 * (p + q)     # mixture distribution, broadcast to (T, M)
    # Give entropy() equally shaped inputs instead of relying on it to broadcast.
    p = np.broadcast_to(p, m.shape)
    divergence = 0.5 * (entropy(p, m) + entropy(q, m))
    # Rounding can push a zero divergence slightly below zero; clamp before the sqrt.
    return np.sqrt(np.maximum(divergence, 0.0))

def get_most_similar_documents(query,matrix,k=10):
    """
    Rank the rows of ``matrix`` by the value jensen_shannon(query, matrix) gives them
    and return the positional indices of the ``k`` smallest, smallest first.
    """
    distances = jensen_shannon(query, matrix)  # one value per row of ``matrix``
    ranking = np.argsort(distances)            # ascending order
    return ranking[:k]



## WORDNET

In [None]:
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Take the first noun synset that WordNet returns for each word.
dog, cat, rose, flower = (
    wn.synsets(word, pos=wn.NOUN)[0]
    for word in ('dog', 'cat', 'rose', 'flower')
)

brown_ic = wordnet_ic.ic('ic-brown.dat')  # load the brown corpus to compute the IC

# res_similarity for three pairs: rose/flower, rose/dog and cat/dog.
for first, second in ((rose, flower), (rose, dog), (cat, dog)):
    print(first.res_similarity(second, brown_ic))

![image.png](./imgs/1_6Gg14KMHGDFNWrW8oB6QZw.png)