In [1]:
import os
import re
from ast import literal_eval
from collections import Counter
from itertools import combinations

import nltk
import numpy as np
import pandas as pd
import tqdm
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
tqdm.pandas()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the patent table. Later cells read the token column selected below,
# plus the 'Abstract' and 'PubYear' columns.
df = pd.read_csv('data/Cleaned_Patents.csv')

In [3]:
# Name of the token column that the rest of the notebook analyses.
# Bigram tokens are used here, but another token column could be substituted.
column = 'BigramsTokens'

In [4]:
# There are a few 'null' and 'nan' tokens which can cause problems with gephi.
# Show the third affected patent: first its token list, then its abstract.
rows_with_null = df[df[column].str.contains("'null'", na=False)]
example_row = rows_with_null.iloc[2]
print(example_row[column])
print(example_row['Abstract'])

['close', 'loop', 'resonator', 'use', 'fuzzy', 'controller', 'closed', 'loop', 'regulator', 'include', 'fuzzy_logic', 'control_unit', 'generate', 'control', 'parameter', 'produce', 'connection', 'base', 'value', 'represent', 'proportional', 'term', 'derivative', 'term', 'range', 'input', 'consider', 'seven', 'range', 'define', 'linguistic', 'term', 'relation', 'null', 'point', 'negative', 'small', 'negative', 'medium', 'negative', 'large', 'etc', 'variation', 'range', 'define', 'overlap', 'membership', 'function', 'specification', 'output', 'define', 'range', 'fuzzy', 'action', 'control', 'sum', 'integral', 'action']
A closed loop regulator includes a fuzzy logic control unit (4) that generates a control parameter (a) to produce a connection that is based upon the values (e1, e2) that represent a proportional term (e2) and a derivative term (e1).  The range of the inputs (e1, e2) are considered by seven ranges defined in linguistic terms relation to the null point; such as negative sma

In [5]:
# Wrap the literal 'null' / 'nan' tokens in underscores so they no longer
# appear verbatim in the token lists (plain substring replacement, no regex).
for quoted_token, replacement in (("'null'", "'_null_'"), ("'nan'", "'_nan_'")):
    df[column] = df[column].str.replace(quoted_token, replacement, regex=False)

# TF-IDF

In [8]:
# Minimum TF-IDF weight a term needs within a patent to be kept as a keyword.
tf_idf_threshold = 0.215


def process_chunk(df_chunk, text_column=None, threshold=None):
    """Attach the high-TF-IDF terms of every row as a 'TF-IDF_Words' column.

    A TfidfVectorizer is fitted on the rows of ``df_chunk`` only, so the
    weights are relative to the documents in this chunk.

    Parameters
    ----------
    df_chunk : pandas.DataFrame
        Rows to process. The text column holds token lists stored as strings
        (e.g. "['close', 'loop']"); the list punctuation is stripped before
        vectorising.
    text_column : str, optional
        Column to read the tokens from. Defaults to the notebook-level
        ``column``.
    threshold : float, optional
        A term is kept when its TF-IDF weight is strictly greater than this
        value. Defaults to the notebook-level ``tf_idf_threshold``. Must be
        non-negative: terms that do not occur in a document are never
        returned.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_chunk`` with an extra 'TF-IDF_Words' column holding,
        per row, the list of kept terms in vocabulary (alphabetical) order.
        An empty chunk is returned with an empty 'TF-IDF_Words' column.
    """
    if text_column is None:
        text_column = column
    if threshold is None:
        threshold = tf_idf_threshold

    df_chunk = df_chunk.copy()  # never write into the caller's frame / slice

    # The vectoriser cannot be fitted on zero documents, so short-circuit.
    if len(df_chunk) == 0:
        df_chunk['TF-IDF_Words'] = pd.Series([], index=df_chunk.index, dtype=object)
        return df_chunk

    # clean and prepare sentences: drop the list syntax ([ ] , ') so that only
    # whitespace-separated tokens remain
    list_syntax = str.maketrans('', '', "[],'")
    sentences = [text.translate(list_syntax) for text in df_chunk[text_column]]

    # compute TF-IDF values; the result stays sparse, because a dense
    # documents x vocabulary array is what exhausts memory on large chunks
    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(sentences).tocsr()
    weights.sort_indices()  # column indices ascending => terms come out alphabetically
    feature_names = tfidf.get_feature_names_out()

    # Filter based on the tf idf threshold, one sparse row at a time
    filtered_sentences = []
    for row in range(weights.shape[0]):
        start, stop = weights.indptr[row], weights.indptr[row + 1]
        above_threshold = weights.data[start:stop] > threshold
        kept_columns = weights.indices[start:stop][above_threshold]
        filtered_sentences.append(feature_names[kept_columns].tolist())

    # add the filtered words back to the copied DataFrame
    df_chunk['TF-IDF_Words'] = filtered_sentences

    return df_chunk

In [9]:
# Run the TF-IDF filtering year by year. Every call to process_chunk is
# independent, so the results are collected in a list and concatenated once
# at the end instead of re-copying a growing DataFrame on every iteration.
result_parts = []

for year in tqdm(range(1990, 2024)):
    # filter DataFrame for the current year
    df_temp = df[df['PubYear'] == year]

    # needed because of memory limitations: from 2019 on, a year is processed
    # as two halves (each half is vectorised on its own)
    if year >= 2019:
        half_size = len(df_temp) // 2
        year_chunks = [df_temp.iloc[:half_size], df_temp.iloc[half_size:]]
    else:
        year_chunks = [df_temp]

    for df_chunk in year_chunks:
        # a year (or half) without patents has nothing to vectorise; skip it
        # rather than letting the vectoriser fail on an empty input
        if len(df_chunk) > 0:
            result_parts.append(process_chunk(df_chunk))

# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True)
if result_parts:
    df_results = pd.concat(result_parts, axis=0, ignore_index=True)
else:
    df_results = pd.DataFrame()

100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [02:44<00:00,  4.83s/it]


In [10]:
# Average number of distinct kept words per patent.
df_results['TF-IDF_Words'].map(lambda words: len(set(words))).mean()

4.964163270810389

# Jaccard

In [13]:
# Group the per-patent results by publication year; the next cell iterates
# over these groups to compute one co-occurrence table per year.
grouped = df_results.groupby('PubYear')

In [14]:
# For every publication year: count how often two kept words appear in the
# same patent and turn those counts into Jaccard coefficients.
yearly_results = []
for year, group in tqdm(grouped):
    patents = group['TF-IDF_Words']

    # co-occurrence count per word pair; each pair is stored in sorted order
    # so that (a, b) and (b, a) are counted together
    combi_count = Counter(
        tuple(sorted(pair))
        for words in patents
        for pair in combinations(words, 2)
    )
    word_associates = pd.DataFrame(
        [[first, second, shared] for (first, second), shared in combi_count.items()],
        columns=['word1', 'word2', 'intersections'],
    )

    # how often each single word occurs across this year's keyword lists
    word_count = pd.DataFrame(
        [[word, occurrences] for word, occurrences in Counter(patents.explode()).items()],
        columns=['word', 'count'],
    )

    # attach the single-word counts of both pair members as count1 / count2
    for word_col, count_col in (('word1', 'count1'), ('word2', 'count2')):
        word_associates = pd.merge(
            word_associates,
            word_count.rename(columns={'word': word_col}),
            on=word_col,
            how='left',
        ).rename(columns={'count': count_col})

    # Jaccard = |A and B| / |A or B|, with |A or B| = |A| + |B| - |A and B|
    union_count = (
        word_associates['count1']
        + word_associates['count2']
        - word_associates['intersections']
    )
    word_associates['union_count'] = union_count
    word_associates['jaccard_coef'] = word_associates['intersections'] / union_count

    # strongest associations first; ties broken by the raw co-occurrence count
    word_associates = word_associates.sort_values(
        by=['jaccard_coef', 'intersections'],
        ascending=False,
    )

    word_associates["year"] = year
    yearly_results.append(word_associates)

100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [00:12<00:00,  2.66it/s]


In [15]:
# Stack the per-year tables into one long frame. The row labels of each yearly
# table are kept as they are, so labels can repeat across years.
jaccard_coef_data_all = pd.concat(yearly_results, axis=0)
jaccard_coef_data_all

Unnamed: 0,word1,word2,intersections,count1,count2,union_count,jaccard_coef,year
246,flux,josephson,2,2,2,2,1.000000,1990
248,flux,magnetic,2,2,2,2,1.000000,1990
249,flux,quantum,2,2,2,2,1.000000,1990
251,josephson,magnetic,2,2,2,2,1.000000,1990
252,josephson,quantum,2,2,2,2,1.000000,1990
...,...,...,...,...,...,...,...,...
331435,image,wireless,1,3909,308,4216,0.000237,2023
65027,image,route,1,3909,375,4283,0.000233,2023
344998,battery,image,1,410,3909,4318,0.000232,2023
194057,image,resource,1,3909,479,4387,0.000228,2023


In [16]:
# Cut-offs for keeping a word pair as an edge.
threshold = 5             # minimum value required for both count1 and count2
jaccard_threshold = 0.06  # minimum Jaccard coefficient of the pair
intersections = 1         # minimum number of co-occurrences of the pair

strong_enough = jaccard_coef_data_all['jaccard_coef'].ge(jaccard_threshold)
both_words_frequent = (
    jaccard_coef_data_all['count1'].ge(threshold)
    & jaccard_coef_data_all['count2'].ge(threshold)
)
co_occurs_enough = jaccard_coef_data_all['intersections'].ge(intersections)

filtered_word_associates = jaccard_coef_data_all[
    strong_enough & both_words_frequent & co_occurs_enough
]
filtered_word_associates

Unnamed: 0,word1,word2,intersections,count1,count2,union_count,jaccard_coef,year
9929,link,strength,4,5,5,6,0.666667,1990
1048,date,pct,11,11,19,19,0.578947,1990
1964,angle,flight,4,6,6,8,0.500000,1990
791,float,gate,4,5,10,11,0.363636,1990
2135,hypothesis,opinion,3,7,5,9,0.333333,1990
...,...,...,...,...,...,...,...,...
333606,agency,mall,3,14,39,50,0.060000,2023
394962,leakage,uid,3,45,8,50,0.060000,2023
406887,cure,printing,3,8,45,50,0.060000,2023
416730,blood_vessel,endovascular,3,46,7,50,0.060000,2023


In [17]:
# Reduce the filtered pairs to an edge list with source / target / year columns.
word_associates = filtered_word_associates[['word1', 'word2', 'year']]
edges = word_associates.rename(columns={"word1": "source", "word2": "target"})

# to_csv does not create missing folders, so make sure the output directory
# exists before the first write.
os.makedirs('./data/edges', exist_ok=True)
edges.to_csv('./data/edges/edges.csv', index=False)

# One row per word pair: only the first occurrence in row order is kept.
no_dupes = edges.drop_duplicates(subset=['source', 'target'], keep='first')
no_dupes.to_csv('./data/edges/edges_no_duplicates.csv', index=False)

# Full statistics (counts, union, Jaccard coefficient) for the kept pairs.
filtered_word_associates.to_csv('data/edges/jaccard_statistics.csv', index=False)

# Median split

In [18]:
# Year whose edges get a frequency tag.
year_to_tag = 2023

# Count how often every kept word occurs across that year's patents.
year_df = df_results.loc[df_results['PubYear'] == year_to_tag]
all_words = []
for kept_words in year_df['TF-IDF_Words']:
    all_words.extend(kept_words)
word_counts = Counter(all_words)

In [19]:
# Words that take part in at least one edge of the tagged year.
year_edges = edges.loc[edges['year'] == year_to_tag]
yearly_words = set(year_edges['source'].tolist() + year_edges['target'].tolist())

# Median occurrence count over exactly those words (words without a count are left out).
filtered_counts = {}
for word in yearly_words:
    if word in word_counts:
        filtered_counts[word] = word_counts[word]
median = np.median(list(filtered_counts.values()))

In [20]:
def tag_edges(row):
    """Label an edge by how its two words compare with the median count.

    Reads the notebook-level ``year_to_tag``, ``word_counts`` and ``median``.

    Returns 'no' for edges of any other year. Otherwise a word is "high" when
    its count is at or above the median, and the result is:
    'high-high' (both words), 'low-high' (exactly one, whichever end it is)
    or 'low-low' (neither).
    """
    if row['year'] != year_to_tag:
        return 'no'

    endpoint_is_high = [
        word_counts[row[endpoint]] >= median
        for endpoint in ('source', 'target')
    ]
    if all(endpoint_is_high):
        return 'high-high'
    if any(endpoint_is_high):
        return 'low-high'
    return 'low-low'

In [21]:
# Tag a copy of the edge list: a plain `tagged_edges = edges` would only bind a
# second name to the same DataFrame, so `edges` itself would gain the 'Tag' column.
tagged_edges = edges.copy()
tagged_edges['Tag'] = edges.progress_apply(tag_edges, axis=1)

100%|████████████████████████████████████████████████████████████████████████| 84042/84042 [00:00<00:00, 191199.61it/s]


In [23]:
# Write the tagged edge list to disk, without the row index.
tagged_edges.to_csv('data/edges/tagged_edges.csv',index=False)

In [24]:
# Display the tagged edge list as the cell output.
tagged_edges

Unnamed: 0,source,target,year,Tag
9929,link,strength,1990,no
1048,date,pct,1990,no
1964,angle,flight,1990,no
791,float,gate,1990,no
2135,hypothesis,opinion,1990,no
...,...,...,...,...
333606,agency,mall,2023,high-high
394962,leakage,uid,2023,low-high
406887,cure,printing,2023,low-high
416730,blood_vessel,endovascular,2023,low-high
