## Fetch popular tags data from StackExchange API

In [None]:
import requests
import time
import pandas as pd

# Download the most popular Stack Overflow tags page by page and cache them
# as a CSV file for the later cells.
fetch_count = 3000
max_page_size = 100
tags = []
for page in range(1, fetch_count // max_page_size + 1):
    response = requests.get(
        'https://api.stackexchange.com/2.3/tags',
        params={
            'page': page,
            'pagesize': max_page_size,
            'order': 'desc',
            'sort': 'popular',
            'site': 'stackoverflow',
        },
        timeout=30,  # never hang forever on a stalled connection
    )
    delay = 1  # To prevent hitting rate limits
    if response.status_code == 200:
        payload = response.json()
        tags.extend(payload.get('items', []))
        # The API may ask clients to pause via a `backoff` field (seconds).
        delay = max(delay, payload.get('backoff', 0))
    else:
        # Still best-effort, but make the gap in the data visible.
        print(f'Skipping tags page {page}: HTTP {response.status_code}')
    time.sleep(delay)


tags_df = pd.DataFrame(tags)
tags_df.to_csv('tempdata/stach_exchange_tags.csv', index=False)

## Fetch synonyms data of popular tags from StackExchange API

In [None]:
import requests
import time
import pandas as pd
from urllib.parse import quote

# Download the synonyms of every popular tag that has any, then store them
# grouped per canonical tag (one row per `to_tag`, with a list of `from_tag`).
tags_df = pd.read_csv('tempdata/stach_exchange_tags.csv')

synonyms = []
tags_has_synonyms = tags_df[tags_df['has_synonyms'].eq(True)]
# Query the tags in batches of 10 names per request.
chunked_tags = [tags_has_synonyms[i:i + 10] for i in range(0, len(tags_has_synonyms), 10)]
for chunk in chunked_tags:
    # Tag names are joined with ';' and percent-encoded for use in the URL path.
    joined_tags = quote(';'.join(map(str, chunk['name'].values)))
    page = 0
    has_next = True
    while has_next:
        page += 1
        # min=1609459200 is the Unix timestamp for 2021-01-01T00:00:00Z.
        # NOTE(review): presumably this restricts results to synonyms with
        # activity since that date — confirm against the API documentation.
        request_url = f'https://api.stackexchange.com/2.3/tags/{joined_tags}/synonyms?pagesize=100&page={page}&order=desc&min=1609459200&sort=activity&site=stackoverflow'
        response = requests.get(request_url, timeout=30)
        # Fail with a clear HTTP error instead of a KeyError on the error body.
        response.raise_for_status()
        payload = response.json()  # parse the body once per page
        synonyms += [{
            "to_tag": synonym['to_tag'],
            "from_tag": synonym['from_tag'],
        } for synonym in payload.get('items', [])]
        has_next = payload.get('has_more', False)
        # To prevent hitting rate limits; honour a server-requested backoff.
        time.sleep(max(1, payload.get('backoff', 0)))

# Explicit columns keep the groupby below valid even when nothing was fetched.
synonyms_df = pd.DataFrame(synonyms, columns=['to_tag', 'from_tag'])
grouped_synonyms_df = synonyms_df.groupby('to_tag')['from_tag'].apply(list).reset_index(name='synonyms')
grouped_synonyms_df.to_csv('tempdata/stack_exchange_tag_synonyms.csv', index=False)

## Fetch StackShare's tool page sitemap data

In [None]:

from bs4 import BeautifulSoup
import requests
import re
import time
import pandas as pd

# Collect every URL listed in StackShare's tool sitemaps, starting from the
# top-level sitemap, and store them as CSV.
stackshare_sitemap_url = 'https://stackshare.io/sitemap.xml'
response = requests.get(stackshare_sitemap_url)
stachshare_sitemap = BeautifulSoup(response.content, 'xml')

# Keep only the <loc> entries whose URL ends in "tools<digits>.xml".
pattern = re.compile(r'.*tools\d*\.xml$')
tools_sitemap_urls = []
for loc in stachshare_sitemap.find_all('loc'):
    if pattern.match(loc.text):
        tools_sitemap_urls.append(loc.text)

stackshare_techs = []
for url in tools_sitemap_urls:
    response = requests.get(url)
    tools_sitemap = BeautifulSoup(response.content, 'xml')
    for loc in tools_sitemap.find_all('loc'):
        page_url = loc.text
        # The last path segment of the URL is used as the label.
        slug = page_url.split('/')[-1]
        stackshare_techs.append({
            'url': page_url,
            'label': slug,
            'unhyphenated_label': slug.replace('-', ''),
        })
    time.sleep(1)  # To prevent hitting rate limits

stackshare_techs_df = pd.DataFrame(stackshare_techs)
stackshare_techs_df.to_csv('tempdata/stackshare_techs.csv', index=False)

## Join synonyms data with popular tags data

In [9]:
import pandas as pd

# literal_eval parses the stored list literals without executing arbitrary
# code, unlike eval; it is only needed by this cell, so import it here.
from ast import literal_eval

# Attach each tag's synonym list (if any) and add the columns that the
# validation step fills in later.
tags_df = pd.read_csv('tempdata/stach_exchange_tags.csv')
synonyms_df = pd.read_csv(
    'tempdata/stack_exchange_tag_synonyms.csv',
    converters={'synonyms': literal_eval},  # "['a', 'b']" -> ['a', 'b']
)
# Left join: tags without synonyms are kept, with a missing `synonyms` value.
tags_with_synonym_df = pd.merge(tags_df, synonyms_df, left_on='name',
                   right_on='to_tag', how='left')
tags_with_synonym_df['is_valid'] = False
tags_with_synonym_df['stackshare_url'] = ''
tags_with_synonym_df['stackshare_label'] = ''
# Force tag names to str so later string operations work on every row.
tags_with_synonym_df['name'] = tags_with_synonym_df['name'].astype(str)

## Validate StackExchange tags with StackShare data

In [13]:
import numpy as np
import pandas as pd
from typing import List

stackshare_techs_df = pd.read_csv('tempdata/stackshare_techs.csv')


def find_label_in_stackshare(label: str):
    """Return the rows of ``stackshare_techs_df`` that match *label*.

    A row matches when either its ``label`` or its ``unhyphenated_label``
    column equals *label* exactly. The result is a (possibly empty)
    DataFrame.
    """
    label_hit = stackshare_techs_df['label'].eq(label)
    unhyphenated_hit = stackshare_techs_df['unhyphenated_label'].eq(label)
    return stackshare_techs_df[label_hit | unhyphenated_hit]

def find_in_stackshare(label: str, synonyms: List[str] = None):
    """Find the first StackShare entry matching a tag or one of its synonyms.

    Candidates are tried in this order:

    1. ``label`` itself;
    2. ``"<label>-lang"``, only when ``label`` has at most two characters;
    3. each entry of ``synonyms``, in order.

    Args:
        label: Tag name to look up.
        synonyms: Alternative names for the tag. ``None`` or a float (such
            as the NaN that marks a missing value) is treated as "no
            synonyms".

    Returns:
        The first matching row as returned by ``find_label_in_stackshare``,
        or ``None`` when no candidate matches.
    """
    # A missing synonyms value arrives as a float NaN; never iterate over it.
    if synonyms is None or isinstance(synonyms, float):
        synonyms = ()

    candidates = [label]
    # Use the function's own argument rather than a global loop variable.
    if len(str(label)) <= 2:
        candidates.append(f"{label}-lang")
    candidates.extend(synonyms)

    for candidate in candidates:
        matches = find_label_in_stackshare(candidate)  # one lookup per candidate
        if matches.shape[0] > 0:
            return matches.iloc[0]
    return None

# Look every tag up on StackShare and record the match (if any) on its row.
for index, tag in tags_with_synonym_df.iterrows():
    try:
        result = find_in_stackshare(tag['name'], tag['synonyms'])
        if result is not None:
            tags_with_synonym_df.loc[index, 'is_valid'] = True
            tags_with_synonym_df.loc[index, 'stackshare_url'] = result['url']
            tags_with_synonym_df.loc[index, 'stackshare_label'] = result['label']
    except Exception:
        # Show the row that failed, then let the error propagate.
        print(tag)
        raise

# Human-readable name: hyphens become spaces (plain text replace, not a regex).
tags_with_synonym_df['display_name'] = tags_with_synonym_df['name'].str.replace('-', ' ', regex=False)
output_columns = ['display_name', 'is_valid', 'count', 'stackshare_url', 'stackshare_label']
tags_with_synonym_df[output_columns].to_csv('data/stack_exchange_tags_with_stackshare.csv', index=False)