In [1]:
import pandas as pd
from config import PATHS

#data before cleaning: 
df = pd.read_json(PATHS['raw_data'], encoding='utf-8', orient='records')
print("columns: ", df.columns.tolist())
print("rows before cleaning: ", df.shape[0])

df.replace(['N/A', "", None], 'unknown', inplace=True)
df.fillna('unknown', inplace=True)

print(f"name has : {df[(df.name == 'N/A') | (df.name == '') | (df.name.isna())].shape[0] } missing values")


columns:  ['name', 'sector', 'technologies', 'city', 'description']
rows before cleaning:  324
name has : 0 missing values


We should avoid dropping any record just because one feature is missing, the only ones to drop are those with all features missing. 
The reason is that we want all startups to be included, dropping some means less couverage.

So there's actually no row to drop. I will see what combinations of features are missing, after that I will use present features to generate text that will be used in text embedding. (the most import feature for this is obviously 'description')

In [2]:
from itertools import combinations

features = df.columns.tolist()
features.remove('name')  # Exclude 'name' from combinations since it's not a feature
def missing_combinations(): 
    for p in range(1, len(features)):
        combos = [list(uplet) for uplet in combinations(features, p)]
        for combo in combos:
            def all_missing(row): return all(row[col] == 'unknown' for col in combo)
            print(df[df.apply(all_missing, axis=1)].shape[0], f"rows with {combo} missing")
        
        print("\n")
        


missing_combinations()
    

1 rows with ['sector'] missing
45 rows with ['technologies'] missing
1 rows with ['city'] missing
6 rows with ['description'] missing


1 rows with ['sector', 'technologies'] missing
0 rows with ['sector', 'city'] missing
0 rows with ['sector', 'description'] missing
0 rows with ['technologies', 'city'] missing
2 rows with ['technologies', 'description'] missing
0 rows with ['city', 'description'] missing


0 rows with ['sector', 'technologies', 'city'] missing
0 rows with ['sector', 'technologies', 'description'] missing
0 rows with ['sector', 'city', 'description'] missing
0 rows with ['technologies', 'city', 'description'] missing




(6 rows with ['description'] missing) this this the most important thing to fix. we see from results above that Alhamdulillah when 'description' is missing, ALL the other features are present 92% of the time, and MOST of them are present 100% of the time. So I will try to generate a description based on present features when it's missing.

In [3]:
#preparing data for text embedding
import re

#print(embedding_df[embedding_df['description'] == 'unknown'])
def normalize_text(text):
    return text.strip().lower().replace("’", "'") if isinstance(text, str) else text.str.strip().lower()

for col in df.columns:
    df[col] = df[col].map(normalize_text)

cleaned_names =[(name.replace("’", "'").replace(",", "").replace(";", "").replace(":", "")
                      .replace("/", "").replace("\\", "").replace("-", "").replace(".", "").strip().lower()) for name in df.name]
df.index = cleaned_names
df.drop(columns=['name'], inplace=True)

#remove startups names from description
def remove_name_from_description(df):
    for name in df.index:
        description = df.loc[name, 'description']
        for word in name.split():
            word_pattern = rf"\b{re.escape(word)}\b"  
            description = re.sub(word_pattern, '', description)

        description = re.sub(r'\s+', ' ', description)                     # multiple spaces to one space
        description = re.sub(r'\s*,\s*', ', ', description)               # clean commas
        description = re.sub(r',\s*,', ',', description)                  # handle multiple commas
        description = re.sub(r'\s*\.\s*', '. ', description)              # clean periods
        df.loc[name, 'description'] = description.strip().lower()
    
    return df

df = remove_name_from_description(df)

#remove abbreviation from description
def remove_abbreviation(row):
    words = row.name.split()
    to_remove = ['de', 'des', 'le', 'la', 'les', 'du', 'd\'', 'l\'', 'un', 'une', 'et', '&']
    for rem in to_remove: 
        if rem in words: words.remove(rem)
        
    abbr = "".join([word[0] for word in words])
    abbr = re.escape(abbr)
    pattern = rf"\(*\s*{abbr}\s*\)*"
    return re.sub(pattern, "", str(row['description'])).strip()

df['description'] = df.apply(remove_abbreviation, axis=1)





In [4]:
#generate missing description based on present 

def generate_description(row):
    if row['description'] != 'unknown':
        return row['description']
    else: 
        return f"a startup operating in {row['sector']} using {row['technologies']} based in {row['city']}"

df['generated_description'] = df.apply(generate_description, axis=1)

#concatenate features for text embedding 
def concatenate_features(row):
    features = []
    if row['description'] != 'unknown':
        features.append(f"business description: {row['description']}")
        features.append(f"industry sectors: {row['sector'].replace(';', ' ').replace("/", ' ')}")
        features.append(f"technologies used: {row['technologies'].replace(';', ' ').replace("/", ' ')}")
        features.append(f"located in: {row['city']}")
        return ' '.join(features)
    
    return row['generated_description'] #because the generated description is already a concatenation of features

df['text_embedding'] = df.apply(concatenate_features, axis=1)

print("text_embedding column created with concatenated features for text embedding.")



text_embedding column created with concatenated features for text embedding.


In [5]:
df.reset_index(inplace=True)  # make names a column again

df.to_csv(PATHS["embedding_data"], index=False, encoding='utf-8') #index=False to save names as a column
print("Data saved to", PATHS['embedding_data'])


Data saved to C:\Users\zakar\OneDrive\Bureau\projects\techatlas\data\embedding_data
