-
Notifications
You must be signed in to change notification settings - Fork 1
/
vc_matcher.py
160 lines (136 loc) · 4.02 KB
/
vc_matcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from nltk import *
import sys
import csv
import pickle
import datetime
import re
import json
from models.company import *
import datetime
def gen_features(startup_ngrams, label=None):
    """
    Builds one single-entry feature set per ngram
    args:
        startup_ngrams: a collection of ngram collections; every ngram is a
            sequence of word strings
        label: a value equal to True (e.g. 1) tags every feature set with
            class 1, a value equal to False (e.g. 0) tags it with class 0,
            anything else (the default None) leaves the feature sets
            unlabelled, which is the form used for classification
    returns:
        feature_list: a list of {'contains (<lowercased ngram>)': True}
            dicts, each wrapped in a (dict, class) tuple when labelled
    """
    # The label is the same for every ngram, so work out the class once.
    # Equality (not identity) is deliberate: callers may pass 1 / 0.
    if label == True:
        target = 1
    elif label == False:
        target = 0
    else:
        target = None

    feature_list = []
    for group in startup_ngrams:
        for raw_ngram in group:
            lowered = tuple(word.lower() for word in raw_ngram)
            # The key embeds the tuple's repr, e.g. "contains ('a', 'b')"
            features = {'contains %s' % (lowered,): True}
            if target is None:
                feature_list.append(features)
            else:
                feature_list.append((features, target))
    return feature_list
def gen_word_features(startup_ngrams, mode='train'):
    """
    Builds one single-entry feature set per word of every ngram
    args:
        startup_ngrams: a collection of ngram collections; every ngram is a
            sequence of word strings
        mode: 'train' pairs every feature set with class 1; any other value
            returns the bare feature dicts
    returns:
        feature_list: a list of {'contains (<lowercased word>)': True}
            dicts, each wrapped in a (dict, 1) tuple in 'train' mode
    """
    labelled = mode == 'train'
    feature_list = []
    for group in startup_ngrams:
        for ngram in group:
            for token in ngram:
                features = {'contains (%s)' % token.lower(): True}
                feature_list.append((features, 1) if labelled else features)
    return feature_list
def build_model(yes_cograms, no_cograms):
    """
    Trains a naive bayes classifier for one VC and serialises it
    args:
        yes_cograms: cograms corresponding to the VC (trained as class 1)
        no_cograms: cograms not corresponding to the VC (trained as class 0)
    returns:
        pickled_classifier: the trained NaiveBayesClassifier, serialised
            with pickle.dumps
    """
    # Negative examples come first, then the positive ones, in one list.
    negatives = gen_features(no_cograms, label=False)
    positives = gen_features(yes_cograms, label=True)
    trained = NaiveBayesClassifier.train(negatives + positives)
    return pickle.dumps(trained)
def test(model, text):
    """
    Takes a VC model and the user given text and returns probability of the VC investing in that company
    args:
        model: NaiveBayesClassifier (anything exposing prob_classify works)
        text: description of startup from user
    returns:
        the probability the model assigns to class 1 (the VC invests) for
        the description; this is a probability, not an accuracy figure
    """
    new_ngrams = build_ngrams(text)
    new_features = gen_features(new_ngrams)
    # A description with fewer than two words produces no ngrams, and
    # indexing the empty list used to raise IndexError.  Classify an empty
    # feature set instead; NLTK's naive bayes classifier then scores from
    # the label priors alone.
    # NOTE(review): only the first feature set (the first bigram) is scored,
    # exactly as before -- confirm that this is intended.
    featureset = new_features[0] if new_features else {}
    return model.prob_classify(featureset).prob(1)
def build_ngrams(text):
    """
    Convert text to bigrams and trigrams
    args:
        text: text to convert
    returns:
        ngrams: a one-element list wrapping the list of all bigram tuples
            followed by all trigram tuples; the inner list is empty when
            the text has fewer than two words
    """
    # remove punctuation: keep only runs of word characters.  The pattern
    # is a raw string because "\w" in a plain literal is an invalid escape
    # sequence that newer Python versions warn about.
    words = re.findall(r"\w+", text)
    # Consecutive pairs / triples, built with zip so the result is always a
    # list of tuples regardless of whether the installed NLTK's
    # bigrams()/trigrams() return lists or generators.
    text_bigrams = list(zip(words, words[1:]))
    text_trigrams = list(zip(words, words[1:], words[2:]))
    ngrams = text_bigrams + text_trigrams
    return [ngrams]
def match_vc(text, page):
    """
    Returns venture capitalist suggestions for a text description of a startup.
    args:
        text: text description of a startup
        page: what range of crunchbase VC entries to search; page N covers
            VC ids up to N * 50, capped at id 438
    returns:
        vc_holder: dict of the form {"listItems": [...]} where every item
            holds the VC 'name', 'url' and 'result' (match probability) for
            each VC on the page scoring at least 0.75; the list is empty
            when the text is too short to produce any ngram
    """
    page_size = 50
    max_vc_id = 438
    # how much of a match must it be to be returned
    threshold = .75

    vc_holder = {"listItems": []}

    ngrams = build_ngrams(text)
    new_features = gen_features(ngrams)
    if not new_features:
        # Fewer than two words: nothing to classify.  Indexing the empty
        # list used to raise IndexError; report "no matches" instead.
        return vc_holder
    # NOTE(review): only the first feature set (the first bigram) is scored,
    # exactly as before -- confirm that this is intended.
    featureset = new_features[0]

    # Both bounds are inclusive.  The first page keeps its original range
    # starting at id 0; later pages start one past the previous page's last
    # id so that a boundary id (50, 100, ...) is no longer returned twice.
    first_id = (page - 1) * page_size
    if page > 1:
        first_id += 1
    last_id = min(page * page_size, max_vc_id)

    print('begin query at:')
    print(datetime.datetime.now())
    # query database for VCs
    # Column expressions replace the hand-concatenated SQL string; this
    # assumes VC.id is the mapped attribute for the "id" column.
    vcs = db.session.query(VC).filter(VC.id >= first_id,
                                      VC.id <= last_id).all()

    # get results
    for vc in vcs:
        print('inside get results')
        print(datetime.datetime.now())
        vc_name = vc.name
        vc_url = vc.url
        print('vc_name %s' % (vc_name))
        print('vc_url %s' % (vc_url))
        # SECURITY NOTE: pickle.loads runs arbitrary code from a crafted
        # blob; nb_model must only ever hold models this application
        # pickled itself (presumably via build_model -- verify).
        vc_model = pickle.loads(vc.nb_model)
        result = vc_model.prob_classify(featureset).prob(1)
        print('result:')
        print(result)
        if result >= threshold:
            vc_holder['listItems'].append({
                'name': vc_name,
                'url': vc_url,
                'result': result,
            })

    print('done making page')
    print(datetime.datetime.now())
    return vc_holder