In [1]:
import pandas as pd
import numpy as np
import math
import anthropic
import json
import time
from typing import List, Dict
from dotenv import load_dotenv, find_dotenv
import os


load_dotenv(find_dotenv())
api_key = os.getenv("ANTHROPIC_API_KEY")

if not api_key:
    raise ValueError("No API key found. Please set ANTHROPIC_API_KEY in your environment variables.")

df = pd.read_csv('companies.csv')
df.head()

Unnamed: 0,Name,Description,Stars,Issues,Pull requests,Forks,Website
0,https://github.com/resemble-ai/chatterbox,SoTA open-source TTS,12662,251,110,1606,https://resemble-ai.github.io/chatterbox_demop...
1,https://github.com/simstudioai/sim,Open-source platform to build and deploy AI ag...,14774,1304,1077,1835,https://www.sim.ai
2,https://github.com/oraios/serena,A powerful coding agent toolkit providing sema...,12124,545,245,832,
3,https://github.com/getzep/graphiti,Build Real-Time Knowledge Graphs for AI Agents,18019,835,641,1618,https://help.getzep.com/graphiti
4,https://github.com/11cafe/jaaz,The world's first open-source multimodal creat...,4313,220,226,354,https://jaaz.app


In [2]:
def calculate_raw_score(row):
    """
    Calculate the raw score for a company using the weighted scoring formula:
    Score_raw = 0.5(l_S) + 0.3(l_F) + 0.2(l_P) - 2ln(1 + IB)
    
    Where:
    - l_X = ln(1 + X) for log1p transformation
    - IB = I/(S + k) to scale issues by traction
    - S = Stars, F = Forks, P = Pull Requests, I = Issues
    """
    S = row['Stars']
    F = row['Forks']
    P = row['Pull requests']
    I = row['Issues']
    k = 10
    
    # Apply log1p transformation: l_X = ln(1 + X)
    l_S = math.log1p(S)
    l_F = math.log1p(F)
    l_P = math.log1p(P)
    
    # Calculate IB = I/(S + k) to scale issues by traction
    IB = I / (S + k)
    
    # Calculate raw score using the formula
    score_raw = 0.5 * l_S + 0.3 * l_F + 0.2 * l_P - 2 * math.log1p(IB)
    
    return score_raw

df['Score_raw'] = df.apply(calculate_raw_score, axis=1)

print("Raw score statistics:")
print(f"Mean: {df['Score_raw'].mean():.4f}")
print(f"Std: {df['Score_raw'].std():.4f}")
print(f"Min: {df['Score_raw'].min():.4f}")
print(f"Max: {df['Score_raw'].max():.4f}")

# Show top 10 companies by raw score
print("\nTop 10 companies by raw score:")
top_companies = df.nlargest(10, 'Score_raw')[['Name', 'Website', 'Stars', 'Forks', 'Pull requests', 'Issues', 'Score_raw']]
print(top_companies)
print(f"Min: {df['Score_raw'].min():.4f}")
print(f"Max: {df['Score_raw'].max():.4f}")


Raw score statistics:
Mean: 7.3150
Std: 1.1538
Min: 3.7589
Max: 9.4271

Top 10 companies by raw score:
                                              Name  \
71        https://github.com/OpenBB-finance/OpenBB   
86             https://github.com/milvus-io/milvus   
77         https://github.com/appsmithorg/appsmith   
78              https://github.com/ToolJet/ToolJet   
68                  https://github.com/novuhq/novu   
10              https://github.com/twentyhq/twenty   
65              https://github.com/makeplane/plane   
82            https://github.com/go-skynet/LocalAI   
92                 https://github.com/grafana/loki   
70  https://github.com/paperless-ngx/paperless-ngx   

                           Website  Stars  Forks  Pull requests  Issues  \
71               https://openbb.co  52175   4954           4734    6922   
86               https://milvus.io  37128   3391          26649    9815   
77        https://www.appsmith.com  37945   4228          17461    9852   
78

In [3]:
import re

categories = ['intelligent_collaboration', 'automated_workflow', 'scalable_infrastructure', 'autonomy']
def get_narrative_alignment_scores_batch(companies_data: List[Dict], category: str, client, batch_size: int = 5) -> List[float]:
    """
    Args:
        companies_data: List of dicts with 'name' and 'description' keys
        category: Investment category to score against
        client: Anthropic client
        batch_size: Number of companies to process in each batch (default: 5)
    
    Returns:
        List of scores from 0.0 to 1.0
    """
    
    category_descriptions = {
        "intelligent_collaboration": "Tools and platforms that enhance team collaboration, communication, and collective intelligence through AI or smart automation",
        "automated_workflow": "Solutions that automate business processes, reduce manual work, and streamline operations",
        "scalable_infrastructure": "Backend systems, databases, cloud platforms, and infrastructure that can scale efficiently",
        "autonomy": "AI agents, autonomous systems, and technologies that can operate independently with minimal human intervention"
    }
    
    all_scores = []
    
    # Process companies in batches
    for i in range(0, len(companies_data), batch_size):
        batch = companies_data[i:i + batch_size]
        
        # Create batch prompt for multiple companies
        batch_prompt = f"""You are a venture capital analyst evaluating technology companies for investment potential.

Investment Category: {category}
Category Definition: {category_descriptions.get(category, '')}

Rate how well each company's product/service aligns with the investment category on a scale of 0.0 to 1.0:
- 0.0-0.2: No alignment or relevance
- 0.2-0.4: Weak alignment, tangential relevance  
- 0.4-0.6: Moderate alignment, some strategic fit
- 0.6-0.8: Strong alignment, good strategic fit
- 0.8-1.0: Excellent alignment, perfect strategic fit

Consider:
- Direct product relevance to the category
- Market positioning and use cases
- Innovation potential in this space
- Strategic value for the investment thesis

Companies to evaluate:
"""
        
        for idx, company in enumerate(batch, 1):
            batch_prompt += f"\n{idx}. Company: {company['name']}\n   Description: {company['description']}\n"
        
        batch_prompt += f"""
IMPORTANT: Respond with ONLY the numeric scores, one per line, in the same order as listed above.
Example response format:
0.75
0.42
0.88"""

        try:
            print(f"  Processing batch {i//batch_size + 1}/{(len(companies_data) + batch_size - 1)//batch_size} ({len(batch)} companies)")
            
            message = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=200,
                temperature=0.1,
                messages=[{"role": "user", "content": batch_prompt}]
            )
            
            response_text = message.content[0].text.strip()
            scores_text = [line.strip() for line in response_text.split('\n') if line.strip()]
            
            batch_scores = []
            for score_text in scores_text:
                try:
                    numbers = re.findall(r'\b0\.\d+|\b1\.0+\b', score_text)
                    if numbers:
                        score = float(numbers[0])
                        batch_scores.append(max(0.0, min(1.0, score)))
                    else:
                        batch_scores.append(0.5)  
                except (ValueError, IndexError):
                    batch_scores.append(0.5) 
            
            while len(batch_scores) < len(batch):
                batch_scores.append(0.5)
            batch_scores = batch_scores[:len(batch)]
            
            all_scores.extend(batch_scores)
            
            # Rate limiting between batches (important for API limits)
            if i + batch_size < len(companies_data): 
                time.sleep(1.5)  
                
        except Exception as e:
            print(f"  Error processing batch for {category}: {e}")
            time.sleep(2.0) 
    
    return all_scores

print("Processing 5 companies per batch to optimize API usage\n")

client = anthropic.Anthropic(api_key=api_key)

companies_data = []
for idx, row in df.iterrows():
    companies_data.append({
        'name': row['Name'],
        'description': row['Description']
    })

# Score each company for each category using batch processing
for category in categories:
    print(f"Scoring companies for: {category}")
    
    scores = get_narrative_alignment_scores_batch(
        companies_data=companies_data,
        category=category,
        client=client,
        batch_size=5
    )
    
    df[f"s_{category}"] = scores
    print(f"Completed! Mean score: {np.mean(scores):.3f}, Std: {np.std(scores):.3f}")
    print()

print("🎉 Batch narrative scoring completed!")


Processing 5 companies per batch to optimize API usage

Scoring companies for: intelligent_collaboration
  Processing batch 1/20 (5 companies)
  Processing batch 2/20 (5 companies)
  Processing batch 3/20 (5 companies)
  Processing batch 4/20 (5 companies)
  Processing batch 5/20 (5 companies)
  Processing batch 6/20 (5 companies)
  Processing batch 7/20 (5 companies)
  Processing batch 8/20 (5 companies)
  Processing batch 9/20 (5 companies)
  Processing batch 10/20 (5 companies)
  Processing batch 11/20 (5 companies)
  Processing batch 12/20 (5 companies)
  Processing batch 13/20 (5 companies)
  Processing batch 14/20 (5 companies)
  Processing batch 15/20 (5 companies)
  Processing batch 16/20 (5 companies)
  Processing batch 17/20 (5 companies)
  Processing batch 18/20 (5 companies)
  Processing batch 19/20 (5 companies)
  Processing batch 20/20 (5 companies)
Completed! Mean score: 0.393, Std: 0.215

Scoring companies for: automated_workflow
  Processing batch 1/20 (5 companies)
  

In [None]:
# Define category weights (adjust later based on firm priorities)
w_vector = np.array([0.25, 0.25, 0.25, 0.25])  # weights for [intelligent_collaboration, automated_workflow, scalable_infrastructure, autonomy]
c = 0.35

print("Investment category weights:")
for i, category in enumerate(categories):
    print(f"  {category}: {w_vector[i]:.1f}")
print(f"  Total: {w_vector.sum():.1f}\n")

eps = 1e-9

# Calculate weighted narrative score S ∈ [0,1]
S = (df[[f"s_{k}" for k in categories]] * w_vector).sum(axis=1)
df['narrative_score'] = S

# Standardize narrative scores (z-score normalization)
S_z = (S - S.mean()) / (S.std(ddof=0) + eps)
df['narrative_z_score'] = S_z

# Calculate adaptive scaling factor
alpha = c * df["Score_raw"].std(ddof=0)

# Apply narrative adjustment to raw scores
df["Score_raw_adj"] = df["Score_raw"] + alpha * S_z

print(f"\nScaling factor (alpha): {alpha:.4f}")
print(f"Max narrative adjustment: ±{alpha * 2:.4f} points")

print(f"\nAdjusted score statistics:")
print(f"  Mean: {df['Score_raw_adj'].mean():.4f}")
print(f"  Std: {df['Score_raw_adj'].std():.4f}")
print(f"  Min: {df['Score_raw_adj'].min():.4f}")
print(f"  Max: {df['Score_raw_adj'].max():.4f}")


Investment category weights:
  intelligent_collaboration: 0.2
  automated_workflow: 0.2
  scalable_infrastructure: 0.2
  autonomy: 0.2
  Total: 1.0


Scaling factor (alpha): 0.4018
Max narrative adjustment: ±0.8036 points

Adjusted score statistics:
  Mean: 7.3150
  Std: 1.2863
  Min: 3.1562
  Max: 9.5760


In [5]:
# Compare rankings before and after narrative adjustment
print("RANKING COMPARISON: Before vs After Narrative Adjustment")
print("="*80)

# Get top 15 companies by original raw score
top_original = df.nlargest(15, 'Score_raw')[['Name', 'Score_raw', 'narrative_score', 'Score_raw_adj']].reset_index(drop=True)
top_original['original_rank'] = range(1, len(top_original) + 1)

# Get top 15 companies by adjusted score
top_adjusted = df.nlargest(15, 'Score_raw_adj')[['Name', 'Score_raw', 'narrative_score', 'Score_raw_adj']].reset_index(drop=True)
top_adjusted['adjusted_rank'] = range(1, len(top_adjusted) + 1)

# Merge to see rank changes
comparison = top_adjusted.merge(
    top_original[['Name', 'original_rank']], 
    on='Name', 
    how='left'
)
comparison['original_rank'] = comparison['original_rank'].fillna(16)  # Companies not in original top 15
comparison['rank_change'] = comparison['original_rank'] - comparison['adjusted_rank']

print("Top 15 Companies After Narrative Adjustment:")
print(f"{'Rank':<4} {'Company':<45} {'Raw':<6} {'Narr':<6} {'Adj':<6} {'Δ Rank':<7}")
print("-" * 80)

for idx, row in comparison.iterrows():
    rank_change_str = f"+{int(row['rank_change'])}" if row['rank_change'] > 0 else str(int(row['rank_change']))
    if row['original_rank'] > 15:
        rank_change_str = "NEW"
    
    company_name = row['Name'].split('/')[-1] if '/' in row['Name'] else row['Name']
    if len(company_name) > 44:
        company_name = company_name[:41] + "..."
    
    print(f"{idx+1:<4} {company_name:<45} {row['Score_raw']:<6.2f} {row['narrative_score']:<6.3f} {row['Score_raw_adj']:<6.2f} {rank_change_str:<7}")

# Show biggest movers
print("\nBiggest Positive Movers (narrative boost):")
positive_movers = comparison[comparison['rank_change'] > 0].nlargest(5, 'rank_change')
for _, row in positive_movers.iterrows():
    company_name = row['Name'].split('/')[-1] if '/' in row['Name'] else row['Name']
    print(f"  {company_name}: {int(row['original_rank'])} → {int(row['adjusted_rank'])} (+{int(row['rank_change'])} positions)")

print("\nBiggest Negative Movers (narrative penalty):")
negative_movers = comparison[comparison['rank_change'] < 0].nsmallest(5, 'rank_change')
for _, row in negative_movers.iterrows():
    company_name = row['Name'].split('/')[-1] if '/' in row['Name'] else row['Name']
    print(f"  {company_name}: {int(row['original_rank'])} → {int(row['adjusted_rank'])} ({int(row['rank_change'])} positions)")


RANKING COMPARISON: Before vs After Narrative Adjustment
Top 15 Companies After Narrative Adjustment:
Rank Company                                       Raw    Narr   Adj    Δ Rank 
--------------------------------------------------------------------------------
1    ToolJet                                       9.13   0.675  9.58   +3     
2    milvus                                        9.27   0.605  9.53   0      
3    novu                                          9.12   0.633  9.45   +2     
4    appsmith                                      9.27   0.532  9.33   -1     
5    CopilotKit                                    8.73   0.703  9.25   +6     
6    plane                                         9.04   0.575  9.22   +1     
7    LocalAI                                       9.00   0.588  9.21   +1     
8    OpenBB                                        9.43   0.400  9.13   -7     
9    twenty                                        9.05   0.525  9.09   -3     
10   sim         

In [None]:
# Normalize adjusted scores to 0-100 scale
def normalize_scores_to_100(scores):
    """Normalize scores to 0-100 scale using min-max normalization"""
    min_score = min(scores)
    max_score = max(scores)
    score_range = max_score - min_score
    return [((score - min_score) / score_range) * 100 for score in scores]

# Apply normalization to adjusted scores
df['normalized_score'] = normalize_scores_to_100(df['Score_raw_adj'])

name_score_map = dict(zip(df['Name'], df['normalized_score'])) # No longer needed 

print("FINAL VC RECOMMENDATION SCORES (0-100 scale)")
print("="*70)
print(f"Total companies scored: {len(df)}")
print(f"Average score: {df['normalized_score'].mean():.1f}/100")
print(f"Score range: {df['normalized_score'].min():.1f} - {df['normalized_score'].max():.1f}")

print("\nTop 20 Companies for VC Investment:")
print(f"{'Rank':<4} {'Company':<45} {'Score':<8} {'Raw':<6} {'Narrative':<9}")
print("-" * 75)

top_final = df.nlargest(20, 'normalized_score')
for idx, (_, row) in enumerate(top_final.iterrows()):
    company_name = row['Name'].split('/')[-1] if '/' in row['Name'] else row['Name']
    if len(company_name) > 44:
        company_name = company_name[:41] + "..."
    
    print(f"{idx+1:<4} {company_name:<45} {row['normalized_score']:<8.1f} {row['Score_raw']:<6.2f} {row['narrative_score']:<9.3f}")



FINAL VC RECOMMENDATION SCORES (0-100 scale)
Total companies scored: 100
Average score: 64.8/100
Score range: 0.0 - 100.0

Top 20 Companies for VC Investment:
Rank Company                                       Score    Raw    Narrative
---------------------------------------------------------------------------
1    ToolJet                                       100.0    9.13   0.675    
2    milvus                                        99.2     9.27   0.605    
3    novu                                          98.1     9.12   0.633    
4    appsmith                                      96.2     9.27   0.532    
5    CopilotKit                                    94.9     8.73   0.703    
6    plane                                         94.4     9.04   0.575    
7    LocalAI                                       94.2     9.00   0.588    
8    OpenBB                                        93.0     9.43   0.400    
9    twenty                                        92.5     9.05   0.525

In [None]:
# Export results to rankings.csv
print("EXPORTING RESULTS TO CSV...")
print("="*50)

rankings_df = df[['Name', 'normalized_score', 'Score_raw', 'narrative_score']].copy()

rankings_df = rankings_df.sort_values('normalized_score', ascending=False).reset_index(drop=True)

rankings_df['Rank'] = range(1, len(rankings_df) + 1)

rankings_df = rankings_df[['Rank', 'Name', 'normalized_score', 'Score_raw', 'narrative_score']]

rankings_df.columns = ['Rank', 'Name', 'Score', 'Raw Score', 'Narrative']

rankings_df['Score'] = rankings_df['Score'].round(2)
rankings_df['Raw Score'] = rankings_df['Raw Score'].round(4)
rankings_df['Narrative'] = rankings_df['Narrative'].round(3)

csv_filename = 'rankings.csv'
rankings_df.to_csv(csv_filename, index=False)

print(f"✅ Successfully exported {len(rankings_df)} companies to '{csv_filename}'")

print(f"   Average score: {rankings_df['Score'].mean():.1f}")

📊 EXPORTING RESULTS TO CSV...
✅ Successfully exported 100 companies to 'rankings.csv'
📁 File location: rankings.csv
📋 Columns: Rank, Name, Score, Raw Score, Narrative

📈 EXPORT SUMMARY:
   Total companies: 100
   Score range: 0.0 - 100.0
   Average score: 64.8

🔍 PREVIEW OF EXPORTED DATA:
 Rank                                     Name  Score  Raw Score  Narrative
    1       https://github.com/ToolJet/ToolJet 100.00     9.1296      0.675
    2      https://github.com/milvus-io/milvus  99.22     9.2689      0.605
    3           https://github.com/novuhq/novu  98.09     9.1221      0.633
    4  https://github.com/appsmithorg/appsmith  96.16     9.2689      0.532
    5 https://github.com/CopilotKit/CopilotKit  94.93     8.7299      0.702
    6       https://github.com/makeplane/plane  94.40     9.0409      0.575
    7     https://github.com/go-skynet/LocalAI  94.23     8.9961      0.588
    8 https://github.com/OpenBB-finance/OpenBB  93.04     9.4271      0.400
    9       https://github