# TechCrunch articles analysis

> Imports

In [16]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import plotly.express as px
import spacy

from collections import Counter
from datetime import datetime
from dotenv import load_dotenv
from gliner import GLiNER
from ipywidgets import interact, Dropdown, fixed
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from tqdm import tqdm
from transformers import pipeline
from utils import NONE_CLASSIFIED, TECH_STOPWORDS, SUB_CATEGORIES
from wordcloud import WordCloud, STOPWORDS

# Populate the process environment (dotenv) so that os.getenv('MONGO_PUBLIC_URI') resolves below.
load_dotenv()

True

> We set up the connection with the `MongoDB` cluster to read its data

In [2]:
# The connection string is read from the environment rather than hard-coded.
uri = os.getenv('MONGO_PUBLIC_URI')
if not uri:
    # Without a URI MongoClient falls back to its default host and the
    # notebook would only fail later, on the first query. Fail fast instead.
    raise RuntimeError('MONGO_PUBLIC_URI is not set; add it to your environment or .env file')
client = MongoClient(uri, server_api=ServerApi('1'))
db = client['tech_scraper_db']
# `data` is the articles collection queried by the cells below.
data = db['articles']

> We look at the number of published articles per category and per year to see which categories might be the most popular through the years

In [3]:
# Count articles per (publication year, category) on the server, ordered by year.
count_by_year_and_category = {
    '$group': {
        '_id': {
            'year': {'$year': '$timestamp'},
            'category': '$category',
        },
        'count': {'$sum': 1},
    }
}
order_by_year = {'$sort': {'_id.year': 1}}
pipeline_macro = [count_by_year_and_category, order_by_year]

data_macro = list(data.aggregate(pipeline_macro))

# Flatten the nested `_id` group key into plain columns for plotting.
macro_rows = []
for group in data_macro:
    group_key = group['_id']
    macro_rows.append({
        'year': group_key['year'],
        'category': group_key['category'],
        'count': group['count'],
    })
df_macro = pd.DataFrame(macro_rows)

def _yearly_category_counts(df):
    """Return `df` as one row per (year, category) with a `count` column."""
    if 'count' in df.columns:
        # Already aggregated (e.g. built from the Mongo `$group` result).
        return df
    # Per-article frame: derive the year from `timestamp` and count the rows.
    years = pd.to_datetime(df['timestamp']).dt.year
    return (
        df.assign(year=years)
        .groupby(['year', 'category'])
        .size()
        .reset_index(name='count')
    )


def plot_main_categories(df, n=10):
    """Plot yearly article counts for the `n` most published categories.

    Args:
        df: either an aggregated frame with `year`, `category` and `count`
            columns, or a per-article frame with `timestamp` and `category`
            columns (it is aggregated here before plotting).
        n: number of categories to keep, ranked by total article count.
    """
    # Work on the frame that was passed in, not on the module-level
    # `df_macro`, so that callers can plot other datasets.
    counts = _yearly_category_counts(df)
    top_categories = counts.groupby('category')['count'].sum().nlargest(n).index
    df_plot1 = counts[counts['category'].isin(top_categories)]

    fig1 = px.line(df_plot1, x='year', y='count', color='category',
                title=f'top {n} techcrunch categories over time',
                template='plotly_dark', line_shape='spline')
    fig1.show()
    
# Draw the chart for the aggregated counts, keeping the default of 10 categories.
plot_main_categories(df_macro)

> It seems like the `none` category represents a big part of the data we have. Looking at the numbers through the years, it seems that the articles didn't really have a category feature at first and that giving them one progressively became a habit.

> We can also see that the categories `hardware`, `startups` and `media & entertainment` represent a vast majority of the articles with a category for the period 2010 to 2022. Again, it could be a bad habit coming directly from the media outlet, as it only had these categories at the time and took a while to start using the other ones to efficiently classify its articles.

> Using this graphic we can still make some observations, like the evolution of `AI`. We can see its beginnings around 2015 with a small number of articles. Then around 2016/17 there is a first surge with some accomplishments in the field, and then the huge spike in popularity around 2023 with the worldwide expansion and democratization of `AI` tools and applications.

> Seeing this is interesting, but it would be even more interesting if we could see the popularity of the sub categories within these. For example, in the `AI` category it would be interesting to see all the main companies and models, when they appeared, which specific topics of the field used to be popular but are not so much anymore, etc. We could also look for the main actors of the tech industry in recent years, comparing the mentions of people, companies, etc.

> That is precisely what we're going to do next. We're first going to display a word cloud for each year to see the most popular words in the articles of that year

In [None]:
# Load the small English spaCy model without its parser and NER components;
# only the tokens' part-of-speech tags are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Raised so that all titles of a year, joined into one string, fit in a single call.
# NOTE(review): presumably above spaCy's default limit — confirm.
nlp.max_length = 2000000
def get_actors_only(text):
    """Return the nouns and proper nouns of `text`, joined by single spaces."""
    kept = []
    for token in nlp(text):
        if token.pos_ == 'NOUN' or token.pos_ == 'PROPN':
            kept.append(token.text)
    return ' '.join(kept)

# Collect the titles of each publication year. Titles are gathered in lists
# and joined once at the end, instead of growing one long string per article
# (repeated concatenation copies the accumulated text again and again).
titles_by_year = {}
for doc in data.find({}, {'title': 1, 'timestamp': 1}):
    title = doc.get('title')
    timestamp = doc.get('timestamp')
    if not isinstance(title, str) or timestamp is None:
        # Skip incomplete documents rather than aborting the whole scan.
        continue
    titles_by_year.setdefault(timestamp.year, []).append(title)

# year -> all titles of that year as one space-separated string
year_text = {year: ' '.join(titles) for year, titles in titles_by_year.items()}

# wordcloud's built-in stopwords plus the TECH_STOPWORDS imported from utils.
tech_stopwords = set(STOPWORDS)
tech_stopwords.update(TECH_STOPWORDS)

def show_wordcloud(year):
    """Display a word cloud of the nouns found in the article titles of `year`.

    Prints a short message instead of raising when there is nothing to draw
    for the requested year.
    """
    raw_text = year_text.get(year)
    if not raw_text:
        # The year selector may land on a year with no collected title.
        print(f'no articles found for {year}')
        return

    text = get_actors_only(raw_text)
    try:
        wc = WordCloud(
            width=800,
            height=400,
            background_color='black',
            colormap='magma',
            stopwords=tech_stopwords,
            collocations=True,
            collocation_threshold=10,
            max_words=100,
            font_path=None
        ).generate(text)
    except ValueError:
        # WordCloud refuses to draw when no word is left after filtering.
        print(f'no usable words found for {year}')
        return

    # The style has to be active before the figure is created; applied
    # afterwards it only affects figures drawn later.
    plt.style.use('dark_background')
    plt.figure(figsize=(12,8))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'top trends in {year}')
    plt.show()
    
# The (min, max) tuple is rendered as an integer slider over the years 2010 to 2026.
interact(show_wordcloud, year=(2010, 2026))

interactive(children=(IntSlider(value=2018, description='year', max=2026, min=2010), Output()), _dom_classes=(…

<function __main__.show_wordcloud(year)>

> It looks rather good, with some interesting terms revealing the trends of each year, but it is still too unordered to retrieve a list of specific terms from it. There also always seems to remain some noise among the words that are accepted and displayed in the graph.

> With the next step, we will use a light nlp model that allows us to "invent" on the go the categories we want for a dataset to inspect and classify. Here we return the top N most mentioned entities of specific sub categories for a chosen year.

> Note: we sample the data and restrict this function to one year at most, because otherwise it would take too much time to process for this notebook

In [13]:
# GLiNER checkpoint used by get_tops below; the entity labels are supplied at prediction time.
model = GLiNER.from_pretrained('urchade/gliner_medium-v2.1')

def get_tops(year, n=30, sample_size=500):
    """Return the `n` most mentioned entities per label in a sample of titles.

    Up to `sample_size` randomly sampled article titles published during
    `year` are run through the GLiNER model. The result maps each label to
    a list of `(entity text, mention count)` pairs, most frequent first.
    """
    year_window = {'$gte': datetime(year, 1, 1), '$lt': datetime(year + 1, 1, 1)}
    stages = [
        {'$match': {'timestamp': year_window}},
        {'$sample': {'size': sample_size}},
        {'$project': {'title': 1}},
    ]
    sampled = list(data.aggregate(stages))

    labels = ['person', 'company', 'location', 'genai model']
    mentions = {label: Counter() for label in labels}
    for article in tqdm(sampled, desc=f'extracting entities for {year}'):
        for entity in model.predict_entities(article['title'], labels, threshold=.75):
            counter = mentions.get(entity['label'])
            # Entities carrying a label outside the requested ones are ignored.
            if counter is not None:
                counter[entity['text'].strip()] += 1
    return {label: counter.most_common(n) for label, counter in mentions.items()}

# Sample the 2025 titles and list, per label, the entity names from most to least mentioned.
tops = get_tops(2025, sample_size=500)

for label in tops:
    names = [entry[0] for entry in tops[label]]
    print(f'{label}: {names}')


The `resume_download` argument is deprecated and ignored in `snapshot_download`. Downloads always resume whenever possible.



Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

extracting entities for 2025: 100%|██████████| 500/500 [00:45<00:00, 10.98it/s]

person: ['Elon Musk', 'Trump', 'CEO', 'Musk', 'Bluesky', 'Sam Altman', 'Roelof Botha', 'Aaron Levie', 'Brian Armstrong', 'Jack Dorsey', 'Jesse Draper', 'Danielle Perszyk', 'Anatoly Yakovenko', 'This founder', 'Construction workers', 'Temu', 'OpenAI exec', 'Chris Hyams', 'Hisayuki Idekoba', 'Garry Tan', 'I', 'Brad Smith', 'Spinny', 'riders', 'Matthew Prince', 'Kiki', 'Humphrey', 'Amanda Kahlow', 'Jack Hartung', 'Dario Amodei']
company: ['OpenAI', 'Google', 'Apple', 'Tesla', 'Spotify', 'Amazon', 'TikTok', 'DeepSeek', 'Nvidia', 'Microsoft', 'SpaceX', 'Meta', 'xAI', 'GM', 'Intel', 'Anthropic', 'TechCrunch', 'Waymo', 'X', 'Netflix', 'Ford', 'YouTube', 'ChatGPT', 'Instagram', 'Vine', 'Coinbase', 'Sony', 'Alphabet', 'Moonvalley', 'Adobe']
location: ['India', 'US', 'UK', 'California', 'data centers', 'China', 'NYC', 'ChatGPT', 'Washington', 'New York', 'Malaysia', 'Mars-bound', 'Delaware', 'MENA', 'Unit 221B', 'Nevada', 'SXSW', 'Ireland', 'Italy', 'understaffed schools', 'Silicon Valley', 'San




> With that data in mind we can build a proper `SUB_CATEGORIES` group listing exactly what to search for. This way we can display another graph of the evolution of article categories through the years, but this time, instead of large categories, it can be narrowed down to more specific topics

In [17]:
# Titles and timestamps of every article published before 2026.
before_2026 = {'timestamp': {'$lt': datetime(2026, 1, 1)}}
wanted_fields = {'title': 1, 'timestamp': 1}
df_sub = pd.DataFrame(list(data.find(before_2026, wanted_fields)))

# Parse the timestamps once and derive both grouping columns from them.
published = pd.to_datetime(df_sub['timestamp'])
df_sub['year'] = published.dt.year
df_sub['month_year'] = published.dt.to_period('M').dt.to_timestamp()

@interact(
    df=fixed(df_sub),
    target=Dropdown(options=SUB_CATEGORIES.keys()),
    splitter=Dropdown(options=['year', 'month'], value='year')
)
def plot_sub_categories(df, target, splitter):
    """Plot how often each keyword of a sub category shows up in titles over time.

    Args:
        df: frame with `title`, `year` and `month_year` columns.
        target: key of SUB_CATEGORIES; its value maps keyword labels to the
            regex searched (case-insensitively) in the titles.
        splitter: 'year' groups by `year`, anything else by `month_year`.
    """
    group_col = 'month_year' if splitter != 'year' else 'year'

    per_keyword = []
    for label, regex in SUB_CATEGORIES.get(target, {}).items():
        matches = df['title'].str.contains(regex, case=False, na=False)
        per_keyword.append(
            df[matches]
            .groupby(group_col)
            .size()
            .reset_index(name='count')
            .assign(keyword=label)
        )

    if not per_keyword:
        # No keyword configured for this target: draw nothing.
        return

    final_df = pd.concat(per_keyword)
    fig = px.line(final_df, x=group_col, y='count', color='keyword',
                title=f'evolution of keywords within the {target} category',
                template='plotly_dark', line_shape='spline')
    if splitter == 'month':
        fig.update_xaxes(dtick='M6', tickformat='%b %Y')
    fig.show()

interactive(children=(Dropdown(description='target', options=('ai', 'genai models', 'people'), value='ai'), Dr…

> Finally, we'll try to analyse the articles without any category a bit. We'll run a simple transformer on the article titles to assign each of them one of the listed candidate labels

> Note: This relabelling work only concerns the local dataset and is done for curiosity and observation purposes only. We won't update the labels in the `MongoDB` cluster, as this part of the project is only a bonus and should not interfere with the main part unless instructed otherwise

In [7]:
# Labels the zero-shot classifier may assign to an uncategorised title.
candidate_labels = [
    'AI', 'Media & Entertainment', 'Hardware', 'Enterprise',
    'Crypto', 'Fintech', 'Security', 'Climate',
]
classifier = pipeline(
    'zero-shot-classification',
    model='valhalla/distilbart-mnli-12-3',
    device=-1,
)

# Resume from the local pickle when it exists; otherwise start from the
# `none`-category articles stored in MongoDB, with no prediction yet.
file_name = NONE_CLASSIFIED
if not os.path.exists(file_name):
    uncategorised = data.find({'category': 'none'}, {'title': 1, 'timestamp': 1})
    df_none = pd.DataFrame(list(uncategorised))
    df_none['predicted_cat'] = None
else:
    df_none = pd.read_pickle(file_name)
    
def run_batch(df, batch_size=1000, *, classify_fn=None, labels=None, save_path=None):
    """Classify the next `batch_size` unlabelled titles and checkpoint the frame.

    Args:
        df: frame with a `title` column and a `predicted_cat` column in which
            missing values mark the rows still to classify. Updated in place.
        batch_size: maximum number of rows classified by this call.
        classify_fn: callable taking `(titles, labels)` and returning, per
            title, a dict whose `labels` list is ordered best-first.
            Defaults to the module-level `classifier`.
        labels: candidate labels; defaults to the module-level
            `candidate_labels`.
        save_path: where the frame is pickled after the batch; defaults to
            the module-level `file_name`.

    Returns:
        The same `df` object.

    Raises:
        ValueError: if the classifier does not return one result per title.
    """
    pending = df[df['predicted_cat'].isna()].head(batch_size)

    if pending.empty:
        print('all articles classified')
        return df

    # Fall back to the notebook-level objects only when nothing was injected.
    if classify_fn is None:
        classify_fn = classifier
    if labels is None:
        labels = candidate_labels
    if save_path is None:
        save_path = file_name

    titles = pending['title'].tolist()
    results = classify_fn(titles, labels)
    if isinstance(results, dict):
        # NOTE(review): a one-title batch may come back as a bare dict rather
        # than a list, depending on the transformers version — confirm.
        results = [results]
    results = list(results)
    if len(results) != len(titles):
        raise ValueError(
            f'classifier returned {len(results)} results for {len(titles)} titles'
        )

    # Keep the top-ranked label of each result, written back in one assignment.
    df.loc[pending.index, 'predicted_cat'] = [res['labels'][0] for res in results]

    df.to_pickle(save_path)
    print(f'processed {len(pending)} articles. saved to {save_path}')
    return df

# Classify the next 100 pending titles, then preview the first rows.
df_none = run_batch(df_none, batch_size=100)
df_none.head()

Loading weights:   0%|          | 0/283 [00:00<?, ?it/s]



processed 100 articles. saved to none_classified.pkl


Unnamed: 0,_id,timestamp,title,predicted_cat
0,699e3b1c2f53ee3153ee73b6,2025-08-08 15:46:12,OpenAI just made an offer the government can’t...,AI
1,699e3b1c2f53ee3153ee76b2,2025-06-25 16:44:49,How one biotech startup is betting on cows and...,Enterprise
2,699e3b522f53ee3153eedd9b,2022-09-28 19:29:43,Vietnam to restrict which social media account...,Media & Entertainment
3,699e3b522f53ee3153eeddad,2022-09-28 17:18:36,Google to launch its image and text-based ‘Mul...,AI
4,699e3b522f53ee3153eeddae,2022-09-28 17:18:17,Google turns to machine learning to advance tr...,AI


> This is a long work to do so it has been split into batches of work.

> Although we won't update the online database, we can still use our local updated one to display once more the main categories with everything correctly labelled and see if we get any difference from it

In [8]:
if not os.path.exists(NONE_CLASSIFIED):
    print(f'error: {NONE_CLASSIFIED} not found. run previous cell at least once')
    raise FileNotFoundError('missing cassification pickle file')

df_full = pd.DataFrame(list(data.find({}, {'category': 1, 'timestamp':1, 'title': 1})))
# NOTE: pickle files can execute code on load; only read the file written by this notebook.
df_none_res = pd.read_pickle(NONE_CLASSIFIED)

# _id -> predicted category, for the articles classified so far.
mapping = df_none_res[df_none_res['predicted_cat'].notna()].set_index('_id')['predicted_cat']

# Replace `none` by the prediction when there is one; articles that are not
# classified yet keep `none`.
is_uncategorised = df_full['category'] == 'none'
df_full.loc[is_uncategorised, 'category'] = (
    df_full.loc[is_uncategorised, '_id'].map(mapping).fillna('none')
)

# plot_main_categories draws `count` per `year` and `category`, so the
# per-article rows are aggregated into that shape before plotting.
df_full_counts = (
    df_full.assign(year=pd.to_datetime(df_full['timestamp']).dt.year)
    .groupby(['year', 'category'])
    .size()
    .reset_index(name='count')
)
plot_main_categories(df_full_counts)