## This code builds a weighted network of technology clusters.

In [None]:
import pandas as pd
import os 
import re
import numpy as np
#import spacy
import string
#Silence pandas' chained-assignment warning: the cells below assign columns on filtered frames
pd.set_option('mode.chained_assignment', None)

In [None]:
#Loading files with links and pages characteristics
link=pd.read_csv('link.csv')
link=link[['from','to']].drop_duplicates()

page = pd.read_csv('enwiki-20191001-page.csv')
locations_df=pd.read_csv('locations.csv') #list of geolocations stored in csv format
#The location names are joined into one regex alternation that is matched against page titles.
#They are plain names, not patterns, so each one is escaped: characters such as '.', '(' or '+'
#must match literally (an unbalanced parenthesis would even stop the pattern from compiling).
#Missing cells are skipped because they cannot be joined as text, and blank names are skipped
#because an empty alternative would match every title.
locations='|'.join(
    re.escape(name)
    for name in locations_df['location'].dropna().astype(str)
    if name.strip()
)
#Regex alternation of title fragments marking pages that are not about technologies
#(wiki maintenance pages, people, places, arts, ...); it is matched against lower-cased titles.
#The former 'newsapaper' entry was a typo that never matched and is spelled 'newspaper' now.
stop_page='companion|user|templates|shared ip addresses from educational institutions|categories|category|use mdy dates|use dmy dates|self-published work|cs1|self-published work|images of|lists of|redirects|redirected|userbox|webarchive|infobox|articles|wikipedia|pages|wikidata|files with no machine-readable|non-free|all free|wikimedia|all-free|webarchive template wayback links|wiki|knight|town|village|culture|history|bishop|school|street|student|restaurant|mayor|festival|death|politician|politics|architecture|musical group|musical band|writer|musician|tourist attraction|commander|king|queen|church|museum|university|gallery|cathedral|neighbourhood|companies|company|country|birth|death|missing|year|award|template:|infobox:|fictional|movie|theatre|tournament|styles|names|novel|people|album|artist|ballet|discography|dance|opera|contest|song|singer|actor|actress|performer|journal|magazine|newspaper|television series|television episode|television channel|television program|bibliograph|genre|poem|country|municipality|village|historic|event|holiday|person|player|biography|religion|religious|company|military conflict|currency|celebrity'

#Keep only the page columns used below and turn the underscores of raw titles into spaces
page = page.loc[:, ['title', 'id', 'namespace']]
page['title'] = page['title'].str.replace('_',' ')

#Selecting only main articles (namespace 0) and category articles (namespace 14)
is_article_or_category = page['namespace'].isin([0, 14])
pages_select = page.loc[is_article_or_category, ['title', 'id']]

#Location names are matched case-sensitively, i.e. before the titles are lower-cased
names_a_location = pages_select['title'].str.contains(locations, na=False)
pages_select = pages_select.loc[~names_a_location]
pages_select = pages_select.assign(title=pages_select['title'].str.lower())

#Dropping titles that contain a stop fragment or any digit
has_stop_fragment = pages_select['title'].str.contains(stop_page, na=False)
has_digit = pages_select['title'].str.contains(r'[0-9]', na=False)
pages_select = pages_select.loc[~(has_stop_fragment | has_digit)]

pages_select = pages_select.reset_index(drop=True).drop_duplicates()

#Making sure we have only 'useful' links -- the ones that connect articles and subcategories only
#(the inner join keeps a link only when the id of its source page is still in pages_select)
link=pd.merge(link, pages_select, how='inner', left_on='from', right_on='id')

link=link[['to','title']]
link=link.rename(columns={'to':'link_to','title':'link_from'})
link=link.drop_duplicates()

#Translation table that deletes every ASCII punctuation character in a single pass
_punctuation_table=str.maketrans('', '', string.punctuation)

def _clean_titles(titles):
    """Return the titles stripped of outer whitespace, cast to str, with punctuation removed."""
    return titles.str.strip().astype(str).str.translate(_punctuation_table)

#Both endpoints get the same cleaning; rows whose cleaned title has 2 characters or fewer are dropped
#NOTE(review): missing titles turn into the literal string 'nan' in astype(str) and pass the
#length filter -- confirm this is intended
#NOTE(review): link_from comes from the lower-cased page titles while link_to is used as read
#from link.csv -- presumably link.csv is already normalised the same way; verify
for column in ['link_to', 'link_from']:
    link[column]=_clean_titles(link[column])
    link=link[link[column].str.len()>2]

#Uploading file with the complete patent-article matches
df=pd.read_csv('complete_matches_full.csv')
df=df[['article_title']].drop_duplicates()

#Making sure we are not losing stand-alone articles:
#every link target also gets a self-link (link_from == link_to)
temp=link[['link_to']].drop_duplicates()
temp['link_from']=temp['link_to']
#DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked frame
link=pd.concat([link, temp], ignore_index=True)
link=link.drop_duplicates()
#Keeping only the links whose source title appears in article_title of the matches file
link=pd.merge(link, df, how='inner', right_on='article_title', left_on='link_from')
link=link[['link_from', 'link_to']]
link=link.drop_duplicates()


####### Now we are building 2nd order links (between A and C if A and C are connected through B) and 3rd order links

In [None]:
link_all=link.copy()
link_all['level']=1 #first-hand connections have level 1
link=link[['link_from', 'link_to']]

def _follow_one_more_link(paths, edges):
    """Return the distinct (link_from, link_to) pairs reached by extending paths with one edge.

    Every row A -> B of `paths` is joined with every row B -> C of `edges`, giving A -> C.
    Pairs that come back to their starting point (A == C) are discarded.
    Both frames must have exactly the columns link_from and link_to.
    """
    joined=pd.merge(paths, edges, how='inner', left_on='link_to', right_on='link_from')
    joined=joined[['link_from_x','link_to_y']]
    joined=joined[joined['link_from_x']!=joined['link_to_y']]
    joined.columns=['link_from','link_to']
    return joined.drop_duplicates()

def _merge_keeping_lowest_level(known, candidates):
    """Stack candidates under known and keep, for each pair, the row with the lowest level."""
    #DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
    combined=pd.concat([known, candidates], ignore_index=True)
    #sort_values returns a new frame: the result has to be assigned, otherwise the sort is lost
    #and keep='first' silently depends on the order in which the frames were stacked
    combined=combined.sort_values(by=['link_from', 'link_to', 'level'], ascending=True)
    return combined.drop_duplicates(subset=['link_from', 'link_to'], keep='first')

link2=_follow_one_more_link(link, link)
link2['level']=2 #connections through one mediator have level 2
link_all=_merge_keeping_lowest_level(link_all, link2)

link3=_follow_one_more_link(link2.drop(['level'], axis=1), link)
link3['level']=3 #connections through two mediators have level 3
link_all=_merge_keeping_lowest_level(link_all, link3)

#The saved rows are ordered by link_from, link_to; the row index is not written
link_all.to_csv(r'link_all.csv', index=False)