In [None]:
# Install necessary packages
!pip install nltk
!pip install scikit-learn
!pip install python-louvain

In [2]:
# Import necessary libraries
import itertools
import re
import time
from collections import Counter
from itertools import combinations

import community.community_louvain as community_louvain
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from geopy.exc import GeocoderServiceError, GeocoderTimedOut
from geopy.geocoders import Nominatim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.stats import chi2_contingency
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from wordcloud import WordCloud

In [None]:
# Fetch the NLTK resources used below: the stop-word list and the WordNet data
for resource_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

In [None]:
# Read the members export (an Excel workbook, not a CSV) into a DataFrame
df = pd.read_excel("path/to/data.xlsx")

## Explore the dataset and preprocess

In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# Show column dtypes and non-null counts
df.info()

In [44]:
# Columns that feed the network analysis; every other column is discarded
fields_to_keep = [
    'linkedinProfileUrl', 'fullName', 'companyName', 'linkedinJobTitle',
    'linkedinDescription', 'location', 'linkedinJobLocation', 'companyIndustry',
    'linkedinHeadline', 'linkedinSkillsLabel', 'linkedinSchoolName',
]

# Continue with an independent copy restricted to those columns
df = df.loc[:, fields_to_keep].copy()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [46]:
# Discard rows without a location, then remove the job-location column
df = (
    df
    .dropna(subset=['location'])
    .drop(columns=['linkedinJobLocation'])
)

## Location transformation to country-level

In [48]:
def clean_location_text(loc):
    """
    Normalise a raw location value before it is geocoded.

    Missing values yield an empty string. Any other value is converted to
    text, stripped of every character that is not a word character,
    whitespace, comma, semicolon or hyphen (this removes emojis and other
    symbols), stripped of the standalone word 'International' regardless of
    case, and finally trimmed of surrounding whitespace.
    """
    if pd.isna(loc):
        return ""

    # (pattern, flags) pairs applied in order; each match is deleted
    removals = (
        (r'[^\w\s,;-]', 0),
        (r'\bInternational\b', re.IGNORECASE),
    )
    cleaned = str(loc)
    for pattern, flags in removals:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    return cleaned.strip()

# Replace every raw location with its cleaned text
df['location'] = df['location'].apply(clean_location_text)

In [50]:
# Nominatim geocoder; the user agent string identifies this project to the service
geolocator = Nominatim(user_agent="culttech-thesis")

def get_country(location):
    """
    Resolve a free-text location to an English country name via Nominatim.

    Parameters
    ----------
    location : str
        Cleaned location text. Falsy values (such as an empty string) are
        not sent to the geocoder.

    Returns
    -------
    str
        The 'country' entry of the geocoder's address details, or "" when
        the input is empty, nothing is found, the result has no country, or
        the geocoding service reports an error.
    """
    if not location:
        return ""
    try:
        geo = geolocator.geocode(location, language='en', addressdetails=True, timeout=10)
    except (GeocoderTimedOut, GeocoderServiceError):
        # Any service-side failure (timeout, unavailable, rate limited, ...) is
        # treated as "country unknown" so one bad lookup cannot abort the run.
        return ""
    finally:
        time.sleep(1)  # Respect API rate limit, also after a failed request

    if not geo:
        return ""
    address = geo.raw.get('address') or {}
    return address.get('country', "")

# Geocode each distinct location once and map the result back onto the rows:
# repeated values would otherwise each cost a rate-limited request.
country_by_location = {loc: get_country(loc) for loc in df['location'].unique()}
df['location'] = df['location'].map(country_by_location)

In [None]:
# Inspect the first values of the location column after geocoding
df['location'].head()

In [None]:
# The geocoding step writes its result straight into 'location', so a separate
# 'country' column does not exist at this point and indexing it unconditionally
# raises KeyError. Fold the column in only when it is actually present.
if 'country' in df.columns:
    df['location'] = df['country']
    df = df.drop(columns=['country'])

## Categorization

In [None]:
lemmatizer = WordNetLemmatizer()

stop_words = stopwords.words('english')
custom_phrases = ['microsoft excel', 'microsoft office', 'microsoft', 'microsoft powerpoint', 'powerpoint', 'office']
role_words = {
    'ceo', 'founder', 'cofounder', 'director', 'manager',
    'owner', 'partner', 'president', 'leader', 'head',
    'executive', 'chief', 'officer', 'consultant', 'staff',
    'managing', 'chairman', 'employee'}

# Set copy of the stop-word list for constant-time membership tests per token
_stop_word_set = frozenset(stop_words)

# Single pattern for all custom phrases. Longer phrases are listed first so
# 'microsoft office' is matched before 'microsoft', and the letter look-arounds
# keep a phrase from being cut out of a longer word (plain substring removal
# turned 'officer' into 'r' and 'offices' into 's').
_custom_phrase_pattern = re.compile(
    r'(?<![a-z])(?:'
    + '|'.join(re.escape(phrase) for phrase in sorted(custom_phrases, key=len, reverse=True))
    + r')(?![a-z])'
)

def preprocess_text(text):
    """
    Applies a series of text preprocessing steps:
    1. Lowercasing
    2. URL removal
    3. Custom phrase removal (whole phrases only, never parts of longer words)
    4. Punctuation and number removal
    5. Tokenization
    6. Stop word and role word removal
    7. Lemmatization

    Returns the remaining tokens joined by single spaces; non-string input
    yields an empty string.

    NOTE: any change to this function changes the TF-IDF features, so the
    manually assigned cluster labels should be re-checked after re-running.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Remove URLs
    url_pattern = r'https?://\S+|www\.\S+|\S+\.(com|org|net|gov|edu|io|co|uk|de|fr|jp|cn|in|ru|br|au|ca|ch|es|it|nl|se|no|dk|fi|ie|nz|at|be|gr|hk|il|kr|mx|my|ph|sg|th|tw|vn|za)\S*'
    text = re.sub(url_pattern, '', text, flags=re.MULTILINE)

    # Remove custom phrases
    text = _custom_phrase_pattern.sub('', text)

    # Remove punctuation and numbers (characters are deleted, not replaced by a
    # space, so 'co-founder' becomes the role word 'cofounder')
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization, stop word removal, and lemmatization
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in _stop_word_set and word not in role_words
    ]

    return ' '.join(tokens)

## Determine which text field combination to use

In [None]:
# Define candidate text columns
candidate_text_columns = [
    'linkedinJobTitle',
    'linkedinDescription',
    'linkedinSkillsLabel',
    'linkedinHeadline',
    'companyIndustry'
]

# For every non-empty combination of candidate columns: build a TF-IDF matrix,
# run K-Means for K = 2..max_k_to_test and keep the K with the best silhouette.
results = []
max_k_to_test = 10
result_columns = ['Combination', 'Optimal K', 'Max silhouette score', 'WCSS at optimal K']

# Loop through combinations of all fields
for i in range(1, len(candidate_text_columns) + 1):
    for combo in itertools.combinations(candidate_text_columns, i):
        current_columns = list(combo)
        combo_name = " + ".join(current_columns)

        # Concatenate columns for the current combination
        df['current_text'] = df[current_columns].fillna('').agg(' '.join, axis=1)
        df['current_cleaned_text'] = df['current_text'].apply(preprocess_text)

        if df['current_cleaned_text'].str.strip().eq('').all():
            print(f"Skipping combination '{combo_name}' as cleaned text is empty.")
            continue

        # TF-IDF
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=5)
        try:
            tfidf_matrix = vectorizer.fit_transform(df['current_cleaned_text'])
        except ValueError as err:
            # Raised when no term survives the min_df pruning (empty vocabulary)
            print(f"Skipping combination '{combo_name}': {err}")
            continue

        n_samples = tfidf_matrix.shape[0]
        best_silhouette = -1 # Silhouette scores range from -1 to 1
        best_wcss = float('inf')
        optimal_k = -1

        # Evaluate K from 2 to max_k_to_test
        for k in range(2, max_k_to_test + 1):
            # The silhouette score is only defined for 2 <= K <= n_samples - 1
            if k >= n_samples:
                break

            kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
            cluster_labels = kmeans.fit_predict(tfidf_matrix)

            # Check if there's only one cluster formed (can happen with very sparse data)
            if len(set(cluster_labels)) < 2:
                continue

            current_silhouette = silhouette_score(tfidf_matrix, cluster_labels)
            current_wcss = kmeans.inertia_

            if current_silhouette > best_silhouette:
                best_silhouette = current_silhouette
                best_wcss = current_wcss
                optimal_k = k

        if optimal_k != -1: # Only store results if at least one valid K was found
            results.append({
                'Combination': combo_name,
                'Optimal K': optimal_k,
                'Max silhouette score': best_silhouette,
                'WCSS at optimal K': best_wcss
            })

print("Text fields combination evaluation results")
# Passing the column names keeps sort_values working even if no combination
# produced a result (an empty list would otherwise give a column-less frame).
results_df = pd.DataFrame(results, columns=result_columns).sort_values(
    by='Max silhouette score', ascending=False
)

# Save results_df as CSV
results_df.to_csv("text_fields_combination_evaluation_results.csv", index=False)
print(results_df.to_markdown(index=False))

## Apply clustering to selected fields

In [None]:
# Columns whose contents are merged into the text used for clustering
text_columns = [
    'linkedinDescription',
    'linkedinJobTitle',
    'companyIndustry',
]

# Join the selected columns row by row (missing values count as empty strings),
# then run the shared preprocessing to obtain the 'cleaned_text' column
df['text'] = df[text_columns].fillna('').apply(' '.join, axis=1)
df['cleaned_text'] = df['text'].map(preprocess_text)

In [None]:
# Preview the first preprocessed texts
df['cleaned_text'].head()

In [None]:
# TF-IDF
# max_features limits the vocabulary size to the top 5000 most frequent terms
# ngram_range=(1,2) includes single words (unigrams) and two-word phrases (bigrams)
# min_df=5 ignores terms that appear in less than 5 documents

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=5)
tfidf_matrix = vectorizer.fit_transform(df['cleaned_text'])

max_k_for_plots = 15


def _plot_metric_vs_k(k_values, scores, title, ylabel):
    """Draw a single 'metric versus number of clusters' line chart."""
    plt.figure(figsize=(10, 6))
    plt.plot(k_values, scores, marker='o', linestyle='--')
    plt.title(title)
    plt.xlabel('Number of clusters (K)')
    plt.ylabel(ylabel)
    plt.xticks(k_values)
    plt.grid(True)
    plt.show()


# Fit K-Means once per K and collect both diagnostics from the same model
# (the seed is fixed, so a second fit per K would give the identical result).
wcss = []               # inertia_ (WCSS) for K = 1..max_k_for_plots
silhouette_scores = []  # silhouette score for K = 2..max_k_for_plots
for i in range(1, max_k_for_plots + 1):
    kmeans = KMeans(n_clusters=i, random_state=42, n_init='auto')
    cluster_labels = kmeans.fit_predict(tfidf_matrix)
    wcss.append(kmeans.inertia_)

    if i == 1:
        # Silhouette score is not defined for a single cluster
        continue
    if len(set(cluster_labels)) < 2:
        # Only one cluster was actually formed (can happen with very sparse data)
        silhouette_scores.append(0)
    else:
        silhouette_scores.append(silhouette_score(tfidf_matrix, cluster_labels))

# Plot elbow method
_plot_metric_vs_k(
    range(1, max_k_for_plots + 1), wcss,
    'Elbow method for optimal K', 'Within-Cluster Sum of Squares (WCSS)'
)

# Plot silhouette score
_plot_metric_vs_k(
    range(2, max_k_for_plots + 1), silhouette_scores,
    'Silhouette score for optimal K', 'Silhouette score'
)

In [None]:
# Final K-Means clustering with the chosen K
n_clusters = 10
top_n_terms = 15  # number of highest-weighted centroid terms shown per cluster

kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
clusters = kmeans.fit_predict(tfidf_matrix)

df['cluster'] = clusters

# Per cluster: feature indices ordered from highest to lowest centroid weight
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

print("\nTop terms per cluster:")
for i in range(n_clusters):
    cluster_members_count = int((df['cluster'] == i).sum())
    print(f"\nCluster {i} (Members: {cluster_members_count}):")
    top_indices = order_centroids[i, :top_n_terms]
    top_terms = [terms[ind] for ind in top_indices]
    print(f"  Top terms: {', '.join(top_terms)}")

    # Size each term by its centroid weight. Feeding the space-joined terms to
    # generate() would split bigrams into single words and ignore the weights.
    centroid = kmeans.cluster_centers_[i]
    term_weights = {
        terms[ind]: float(centroid[ind])
        for ind in top_indices
        if centroid[ind] > 0
    }
    if not term_weights:
        print("  No positively weighted terms; word cloud skipped.")
        continue

    # Generate word cloud for the current cluster
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(term_weights)

    # Display the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for Cluster {i}')
    plt.axis('off')
    plt.show()

In [None]:
# Manually assign cluster labels.
# The ids refer to one specific clustering result; re-check the top terms per
# cluster whenever the preprocessing, the features or n_clusters change.
cluster_labels = {
    0: "Public sector & economic development",
    1: "Creative & design",
    2: "Management consulting",
    3: "Venture capital & investment",
    4: "Music & media production",
    5: "Finance, IT & business services",
    6: "Museums & cultural institutions",
    7: "Tech & software",
    8: "General startup & tech ecosystem",
    9: "Academia & higher education"
}

# Add labels to the DataFrame
df['cluster_label'] = df['cluster'].map(cluster_labels)

# Series.map leaves NaN for ids that are missing from the dict; fail here with
# a clear message instead of carrying unlabeled members into the network.
unmapped_clusters = sorted(int(c) for c in df.loc[df['cluster_label'].isna(), 'cluster'].unique())
if unmapped_clusters:
    raise ValueError(
        f"No label defined for cluster id(s) {unmapped_clusters}; "
        "update cluster_labels to match the current clustering."
    )

# View value counts of each cluster
print(df[['cluster', 'cluster_label']].value_counts())

In [None]:
# Bar chart of how many members fall under each cluster label
label_counts = df['cluster_label'].value_counts()

fig, ax = plt.subplots(figsize=(15, 7))
label_counts.plot(kind='bar', ax=ax, color='skyblue')

ax.set_title('Frequency of cluster labels')
ax.set_xlabel('Cluster label')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Generate a cleaner DataFrame to construct the network and assign node IDs

In [None]:
# Attributes needed to build the network
column_to_keep = [
    'companyName',
    'location',
    'linkedinSchoolName',
    'cluster_label'
    ]
# .copy() makes df an independent frame, so adding 'node_id' below does not
# write to a slice of the previous frame (pandas' SettingWithCopyWarning).
df = df[column_to_keep].copy()

# Sequential identifiers S1, S2, ... in row order
df['node_id'] = [f"S{position}" for position in range(1, len(df) + 1)]

In [None]:
# Preview the reduced frame with its node ids
df.head()

In [None]:
# Columns wanted in the node table. 'fullName' is not part of the reduced df
# built from column_to_keep, and selecting a missing column raises KeyError,
# so only the columns that are actually present are taken.
node_columns = ['node_id', 'fullName', 'cluster_label', 'companyName', 'linkedinSchoolName', 'location']
nodes_df = df[[col for col in node_columns if col in df.columns]].copy()

# Save nodes
nodes_df.to_csv("nodes.csv", index=False)

# Last expression of the cell so the preview is actually displayed
nodes_df.head()

## Calculate Cramer's V for categorical variables

In [None]:
categorical_cols = ['companyName', 'location', 'linkedinSchoolName', 'cluster_label']

# Function to compute Cramér's V for categorical variables
def cramers_v(x, y):
    """
    Cramér's V between two categorical variables, with the small-sample bias
    correction applied to phi² and to the table dimensions.

    Parameters
    ----------
    x, y : array-like
        Category values of equal length; they are cross-tabulated with
        pd.crosstab.

    Returns
    -------
    float
        The association strength, or NaN when it is undefined: either
        variable has fewer than two categories, or the corrected table
        dimension leaves a non-positive denominator (for example when one
        variable has a distinct value for every observation).
    """
    confusion_matrix = pd.crosstab(x, y)
    if confusion_matrix.shape[0] < 2 or confusion_matrix.shape[1] < 2:
        return np.nan
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1)*(r - 1)) / (n - 1))
    rcorr = r - ((r - 1)**2) / (n - 1)
    kcorr = k - ((k - 1)**2) / (n - 1)
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # The bias correction shrank a dimension to one (or below): dividing
        # would give 0/0 or a negative ratio, so report "undefined" explicitly.
        return np.nan
    return np.sqrt(phi2corr / denominator)

# Pairwise Cramér's V between all categorical attributes, as a float matrix
correlation_matrix = pd.DataFrame(
    [
        [cramers_v(nodes_df[row_attr], nodes_df[col_attr]) for col_attr in categorical_cols]
        for row_attr in categorical_cols
    ],
    index=categorical_cols,
    columns=categorical_cols,
).astype(float)

# Annotated heatmap of the matrix
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Cramér's V correlation matrix for categorical attributes")
plt.tight_layout()
plt.show()

## Edge construction

In [None]:
def _is_known(value):
    """Return True when an attribute holds a usable value (not NaN/None and not blank)."""
    return bool(pd.notnull(value)) and str(value).strip() != ""


# Two members are linked when they share a company or a school (strong tie),
# or when they share both the cluster label and the location (soft tie).
# Missing or blank attributes never count as a match: the geocoding step
# returns "" for unresolved locations, and treating "" == "" as a shared
# location would link every un-geocoded member of a cluster.
edges = []
for a, b in combinations(nodes_df.itertuples(index=False), 2):
    same_company = _is_known(a.companyName) and a.companyName == b.companyName
    same_school = _is_known(a.linkedinSchoolName) and a.linkedinSchoolName == b.linkedinSchoolName
    same_sector_and_location = (
        _is_known(a.cluster_label) and a.cluster_label == b.cluster_label
        and _is_known(a.location) and a.location == b.location
    )

    if same_company or same_school or same_sector_and_location:
        edges.append({'source': a.node_id, 'target': b.node_id})

# Save edges (explicit columns keep the frame well-formed when no edge exists)
edges_df = pd.DataFrame(edges, columns=['source', 'target'])
edges_df.to_csv("edges.csv", index=False)

## Network construction

In [None]:
# Undirected graph of members
G = nx.Graph()

# One node per member, carrying the attributes used later for hover text and colouring
for member in nodes_df.itertuples(index=False):
    G.add_node(
        member.node_id,
        cluster_label=member.cluster_label,
        companyName=member.companyName,
        linkedinSchoolName=member.linkedinSchoolName,
        location=member.location
    )

# One edge per row of the edge list
for link in edges_df.itertuples(index=False):
    G.add_edge(link.source, link.target)

In [None]:
# Spring layout with a fixed seed; store each coordinate on its node as 'pos'
pos = nx.spring_layout(G, seed=42)

for node, coordinates in pos.items():
    G.nodes[node]['pos'] = coordinates

In [None]:
# Line segments for all edges: each edge contributes its two endpoints followed
# by None, which breaks the line between consecutive edges
edge_x, edge_y = [], []

for source, target in G.edges():
    (x0, y0), (x1, y1) = G.nodes[source]['pos'], G.nodes[target]['pos']
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line=dict(width=0.5, color='#888')
)

In [None]:
# Encode each distinct cluster label as an integer (alphabetical order) for colouring
cluster_labels = [attrs['cluster_label'] for _, attrs in G.nodes(data=True)]
label_to_int = {label: code for code, label in enumerate(sorted(set(cluster_labels)))}

# Marker coordinates, hover text and colour code per node
node_x, node_y, node_text, node_color = [], [], [], []

for node, attrs in G.nodes(data=True):
    x, y = attrs['pos']
    cluster_label = attrs['cluster_label']
    node_x.append(x)
    node_y.append(y)
    node_color.append(label_to_int[cluster_label])
    node_text.append(
        f"Node ID: {node}<br>"
        f"Company: {attrs['companyName']}<br>"
        f"School: {attrs['linkedinSchoolName']}<br>"
        f"Location: {attrs['location']}<br>"
        f"Sector: {cluster_label}"
    )

marker_style = dict(
    size=10,
    color=node_color,
    colorscale='Viridis',
    colorbar=dict(title='Cluster Label (Encoded)'),
    line=dict(width=1)
)

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers',
    hoverinfo='text',
    text=node_text,
    marker=marker_style
)

In [None]:
# Plot the network: edges first, nodes drawn on top.
# The title font is set through title.font; the older 'titlefont' shortcut is
# deprecated and is rejected as an invalid property by newer plotly releases.
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title=dict(text='Culttech members network', font=dict(size=16)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=20, r=20, t=40),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False)
    )
)

fig.show()

## Calculate node-level network measures

In [None]:
# Degree, betweenness and closeness centrality, each computed with default settings
degree_centrality, betweenness_centrality, closeness_centrality = (
    measure(G)
    for measure in (nx.degree_centrality, nx.betweenness_centrality, nx.closeness_centrality)
)

# Eigenvector centrality, allowing up to 1000 power iterations
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)

In [None]:
# Centrality dictionaries keyed by column name; all four share the node order of G
centrality_measures = {
    'degree_centrality': degree_centrality,
    'betweenness_centrality': betweenness_centrality,
    'closeness_centrality': closeness_centrality,
    'eigenvector_centrality': eigenvector_centrality,
}

# One row per node with its four scores, then the node attributes joined on
centrality_df = pd.DataFrame({'node_id': list(degree_centrality.keys())})
for column_name, scores in centrality_measures.items():
    centrality_df[column_name] = list(scores.values())

centrality_df = centrality_df.merge(nodes_df, on='node_id')

# Ten highest-scoring nodes for each measure
top_degree, top_betweenness, top_closeness, top_eigenvector = (
    centrality_df.nlargest(10, column_name) for column_name in centrality_measures
)

# (heading, score column, ranking, output file) for each measure
rankings = [
    ("Top 10 by Degree Centrality:\n", 'degree_centrality', top_degree, "top_degree_centrality.csv"),
    ("\nTop 10 by Betweenness Centrality:\n", 'betweenness_centrality', top_betweenness, "top_betweenness_centrality.csv"),
    ("\nTop 10 by Closeness Centrality:\n", 'closeness_centrality', top_closeness, "top_closeness_centrality.csv"),
    ("\nTop 10 by Eigenvector Centrality:\n", 'eigenvector_centrality', top_eigenvector, "top_eigenvector_centrality.csv"),
]

for heading, column_name, ranking, _ in rankings:
    print(heading, ranking[['node_id', column_name]].reset_index(drop=True))

# Save results to CSV
for _, _, ranking, csv_name in rankings:
    ranking.to_csv(csv_name, index=False)

## Network-level measures

In [None]:
# Size and density of the whole graph
num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()
density = nx.density(G)

# Diameter and average path length are computed on the whole graph when it is
# connected, otherwise on a copy of its largest connected component
if nx.is_connected(G):
    path_graph = G
else:
    largest_cc_nodes = max(nx.connected_components(G), key=len)
    largest_cc = G.subgraph(largest_cc_nodes).copy()
    path_graph = largest_cc

diameter = nx.diameter(path_graph)
avg_path_length = nx.average_shortest_path_length(path_graph)

# Average clustering coefficient over all nodes of G
avg_clustering = nx.average_clustering(G)

summary_lines = [
    "Network-level measures",
    f"Number of nodes: {num_nodes}",
    f"Number of edges: {num_edges}",
    f"Density: {density:.4f}",
    f"Diameter (largest connected component): {diameter}",
    f"Average clustering coefficient: {avg_clustering:.4f}",
    f"Average shortest path length (LCC): {avg_path_length:.4f}",
]
for summary_line in summary_lines:
    print(summary_line)

## Apply Louvain community detection algorithm

In [None]:
# Louvain is a randomised heuristic: without a fixed random_state the
# partition, and the modularity and community counts derived from it, can
# differ between runs. The seed matches the one used for K-Means and the
# spring layout elsewhere in this notebook.
partition = community_louvain.best_partition(G, random_state=42)

# Add community info to graph nodes
nx.set_node_attributes(G, partition, 'louvain_community')

In [None]:
# Modularity of the Louvain partition on G, printed with four decimals
modularity_score = community_louvain.modularity(partition, G)
print(f"Modularity: {modularity_score:.4f}")

In [None]:
# Number of distinct community ids assigned by the partition
num_communities = len(set(partition.values()))
print(f"Number of detected communities: {num_communities}")

In [None]:
# Count how many nodes were assigned to each community id
community_sizes = Counter(partition.values())
print("Community sizes:", community_sizes)