In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

In [18]:
data = pd.read_csv('startup_fate/startup_train.csv')
data = data.drop(columns=['index'])
data = data.fillna('')
print(data.shape)
data.head()

(5512, 13)


Unnamed: 0,name,overview,category_code,tag_list,country_code,num_prev_rounds,has_raised_amount,ln_raised_amount,participants,previous_any_founder_experience,ipo_prob,ma_prob,has_next_round
0,Stackdriver,Stackdriver provides a [powerfully simple moni...,enterprise,"application-management, cloud-monitoring, moni...",USA,1,1,15.424948,1,0,0.02439,0.182927,1
1,Authix Tecnologies,Authix Tecnologies is a Torino based start-up ...,security,authentication-solution,GRC,1,1,13.340052,1,0,0.0,0.0,1
2,Lytics,Lytics provides B2C marketers the first analyt...,software,"analytics, big-data, data-science, bigdata",USA,2,1,14.603968,5,0,0.021505,0.242105,0
3,1World Online,1World Online is a Silicon Valley-based startu...,enterprise,"social-research, big-data, analytics, mobile",USA,1,1,13.815511,1,0,0.0,0.0,0
4,Enure Networks,"Enure Networks, Ltd. provides home-network man...",software,,ISR,1,1,15.894952,2,0,0.0,0.0,1


In [19]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('Databook/db-TinyBERT')
model = BertModel.from_pretrained("Databook/db-TinyBERT")
custom_text = "You are welcome to utilize any text of your choice."
encoded_input = tokenizer(custom_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
output_embeddings = model(**encoded_input)

In [20]:
embedding = model(**encoded_input).last_hidden_state.cpu().detach().squeeze().numpy()
embedding.sum(axis=0).shape

(128,)

In [21]:
# model = BertModel.from_pretrained("Databook/db-TinyBERT")
# model.to(torch.device('cuda'))
# tokenizer = BertTokenizer.from_pretrained('Databook/db-TinyBERT')

model = BertModel.from_pretrained("bert-base-cased")
model.to(torch.device('cuda'))
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


def bert_embeddings(data: pd.DataFrame, col: str):
    embeddings = []
    tokenized_texts = tokenizer(data[col].to_list(), return_tensors='pt', padding=True, truncation=True, max_length=512)
    for i in tqdm(range(tokenized_texts['input_ids'].shape[0])):
        tokenized = {key: val[[i]].to(torch.device('cuda')) for key, val in tokenized_texts.items()}
        embedding = model(**tokenized).last_hidden_state.cpu().detach().squeeze().numpy()
        embedding = embedding.sum(axis=0)
        embeddings.append(embedding)
    return np.array(embeddings)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_col(data: pd.DataFrame, col: str, vectorizer=None):
    texts_samples = data[col].to_list()
    if not vectorizer:
        vectorizer = TfidfVectorizer(stop_words='english', max_features=800, ngram_range=(1, 2))
    tf_idf = vectorizer.fit_transform(texts_samples)
    return tf_idf, vectorizer


In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


text = ['name', 'overview', 'tag_list']
categorical = ['category_code', 'country_code']
numeric = ['num_prev_rounds', 'ln_raised_amount', 'participants', 'ipo_prob', 'ma_prob']
other = ['has_raised_amount', 'previous_any_founder_experience']

In [24]:
vectors_list = []
for col in text:
    vectorized_texts = bert_embeddings(data, col)
    vectors_list.append(vectorized_texts)

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical),
    # ('scaling', StandardScaler(), numeric),
    ('other',  'passthrough', other)
])

target = data['has_next_round'].to_numpy()
features = column_transformer.fit_transform(data).toarray()
features = np.concatenate([features, *vectors_list], axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5512/5512 [01:20<00:00, 68.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5512/5512 [01:39<00:00, 55.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5512/5512 [01:16<00:00, 71.88it/s]


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, train_size=0.8, random_state=3)

In [26]:
from sklearn.metrics import accuracy_score

def calc_score(true, pred):
    return 32 * (accuracy_score(true, pred) - 0.5)

In [27]:
from catboost import CatBoostClassifier

catboost_features = {
    'iterations': 1000,
    'random_seed': 1,
    'eval_metric': 'Accuracy',
    'verbose': 100,
    # 'task_type': 'GPU',
}
model = CatBoostClassifier(
    **catboost_features
)
model.fit(X_train, y_train, eval_set=(X_test, y_test))
pred = model.predict(X_test)
calc_score(y_test, pred)

Learning rate set to 0.045715
0:	learn: 0.6048991	test: 0.5793291	best: 0.5793291 (0)	total: 321ms	remaining: 5m 21s
100:	learn: 0.7827172	test: 0.6137806	best: 0.6264733 (11)	total: 21.9s	remaining: 3m 14s
200:	learn: 0.8772964	test: 0.6137806	best: 0.6264733 (11)	total: 43.4s	remaining: 2m 52s
300:	learn: 0.9614425	test: 0.5965549	best: 0.6264733 (11)	total: 1m 4s	remaining: 2m 30s
400:	learn: 0.9879791	test: 0.5947416	best: 0.6264733 (11)	total: 1m 26s	remaining: 2m 9s
500:	learn: 0.9984123	test: 0.6029012	best: 0.6264733 (11)	total: 1m 48s	remaining: 1m 47s
600:	learn: 0.9995464	test: 0.5965549	best: 0.6264733 (11)	total: 2m 9s	remaining: 1m 26s
700:	learn: 1.0000000	test: 0.5865820	best: 0.6264733 (11)	total: 2m 31s	remaining: 1m 4s


KeyboardInterrupt: 