forked from codecombat/codecombat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
buildSchoolGraph.coffee
381 lines (344 loc) · 17.5 KB
/
buildSchoolGraph.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
# Organize our users' schoolNames.
database = require '../server/commons/database'
mongoose = require 'mongoose'
log = require 'winston'
async = require 'async'
moment = require 'moment'
fs = require 'fs'
exec = require('child_process').exec
### SET UP ###
# Publishes lodash (with the underscore.string helpers mixed in) and tv4 as
# process-wide globals, then opens the database connection.
# `global` is used instead of the deprecated `GLOBAL` alias, which newer
# Node.js releases no longer define.
do (setupLodash = this) ->
  global._ = require 'lodash'
  _.str = require 'underscore.string'
  _.mixin _.str.exports()
  global.tv4 = require('tv4').tv4
database.connect()
UserHandler = require '../server/users/user_handler'
User = require '../server/users/User'
# Only users created after this date are considered (JS months are 0-based, so this is 1 Dec 2015).
startDate = new Date 2015, 11, 1
# When true, findUserToSchool/findSuggestions print progress and timing information.
debugging = false
query = dateCreated: {$gt: startDate}, emailLower: {$exists: true}
selection = 'name emailLower schoolName courseInstances clans ageRange dateCreated referrer points lastIP hourOfCode preferredLanguage lastLevel'

# Load the candidate users as plain objects and start the interactive loop.
User.find(query).select(selection).lean().exec (err, users) ->
  if err
    # Without this guard a failed query left `users` undefined and crashed on `users.length`.
    log.error "Couldn't load users: #{err}"
    return process.exit 1
  usersWithSchools = _.filter users, 'schoolName'
  log.info "Found #{usersWithSchools.length} users of #{users.length} users registered after #{startDate}."
  nextPrompt users
# Main interactive step: choose a user to classify, show the suggested school
# names, and read the operator's choice.
#
# users        - pool of users still to be classified.
# question     - optional prompt text; when omitted the full suggestion list is printed.
# userToSchool - optional user being classified; when omitted one is chosen via findUserToSchool.
# suggestions  - suggestions belonging to userToSchool (required when userToSchool is given).
#
# Exits the process when no further target can be found or the operator types q/quit.
nextPrompt = (users, question, userToSchool, suggestions) ->
  # We look for the next top user to classify based on the number of suggestions we can make about what the school name should be.
  sortUsers users
  unless userToSchool
    [userToSchool, suggestions] = findUserToSchool users
    # findUserToSchool always returns an array (truthy), so the target itself has to be tested.
    unless userToSchool
      console.log 'Done.'
      return process.exit()
  question ?= formatSuggestions userToSchool, suggestions
  openTSV userToSchool, suggestions
  prompt question, (answer) ->
    answer = answer.trim()
    return console.log('Bye.') or process.exit() if answer in ['q', 'quit']
    if answer is ''
      # Skip this user for now.
      return nextPrompt _.without users, userToSchool
    else if /^\d+$/.test answer
      # Only an all-digit answer is a menu choice; parseInt alone would also
      # accept typed names that merely start with digits ("21st Century Academy, ...").
      num = parseInt answer, 10
      schoolName = if num then suggestions[num - 1]?.schoolName else userToSchool.schoolName
      unless schoolName
        console.log "#{answer}? There is no school name behind that number. What should it really be?"
        return nextPrompt users, "> ", userToSchool, suggestions
      return finalizePrompt userToSchool, suggestions, schoolName, users
    else if answer.length < 10
      console.log "#{answer}? That's kind of short--I don't think school names and locations can be this short. What should it really be?"
      return nextPrompt users, "> ", userToSchool, suggestions
    else unless /,.+,/.test answer
      console.log "#{answer}? We need the full location (with two commas), like Example High School, Springfield, IL. What should it really be?"
      return nextPrompt users, "> ", userToSchool, suggestions
    else
      return finalizePrompt userToSchool, suggestions, answer, users
# Asks which of the listed users should receive `schoolName`, then saves it.
#
# userToSchool - the user being classified (number 0 in the operator's list).
# suggestions  - suggestion objects; number n (n >= 1) refers to suggestions[n - 1].user.
# schoolName   - the school name chosen in nextPrompt.
# users        - current pool of unclassified users.
#
# After a successful save the updated users are removed from the pool and the
# loop continues with nextPrompt. A failed save re-asks the same question.
finalizePrompt = (userToSchool, suggestions, schoolName, users) ->
  console.log "Selected schoolName: \"#{schoolName}\""
  question = "Also apply this to other users? Ex.: 'all', '0 1 2 5 9-14', 'all but 38 59-65', '0' to just do this one, q to quit, or blank to retype school name.\n> "
  # Resolves a typed number to its user; yields undefined for a number with no suggestion behind it.
  userForNumber = (number) -> if number then suggestions[number - 1]?.user else userToSchool
  prompt question, (answer) ->
    answer = answer.trim()
    return console.log('Bye.') or process.exit() if answer in ['q', 'quit']
    if answer is ''
      # The question promises that a blank answer lets the operator retype the
      # name, so nothing is saved here.
      console.log "Not saving \"#{schoolName}\". What should the school name be instead?"
      return nextPrompt users, "> ", userToSchool, suggestions
    else if answer is 'all'
      targets = [userToSchool].concat (s.user for s in suggestions)
      console.log "Doing all #{targets.length} users..."
    else if /^all/.test answer
      # "all but ...": start from everyone and drop the listed numbers.
      numbers = findNumbers answer, suggestions.length
      skipped = _.filter (userForNumber(number) for number in numbers)
      everyone = [userToSchool].concat (s.user for s in suggestions)
      targets = _.without everyone, skipped...
      console.log "Doing all #{targets.length} users without #{numbers}..."
    else
      numbers = findNumbers answer, suggestions.length
      # _.filter drops numbers that resolved to no user; _.uniq drops repeats.
      targets = _.uniq _.filter (userForNumber(number) for number in numbers)
      console.log "Doing #{targets.length} users for #{numbers}..."
    unless targets.length
      # Nothing usable was typed; ask again instead of silently moving on.
      console.log "No users selected by \"#{answer}\"."
      return finalizePrompt userToSchool, suggestions, schoolName, users
    User.update {_id: {$in: (_.map targets, '_id')}}, {schoolName: schoolName}, {multi: true}, (err, result) ->
      if err
        console.error "Ran into error doing the save:", err
        return finalizePrompt userToSchool, suggestions, schoolName, users
      console.log "Updated users' schoolNames. Result:", result
      # Take these users out of the pool to make suggestions about before going on to next suggestions.
      remainingUsers = _.without users, targets...
      nextPrompt remainingUsers
# Extracts the user numbers mentioned in an operator answer.
#
# answer - free text such as '0 1 2 5 9-14' or 'all but 38 59-65'; commas and
#          whitespace both separate entries, and words are ignored.
# max    - highest valid number (the number of suggestions).
#
# Returns an array of distinct integers in the order they were written, with
# 'a-b' ranges expanded inclusively. Numbers above `max` are reported on the
# console and left out, so callers never index past the suggestion list.
findNumbers = (answer, max) ->
  numbers = []
  # Tokenising (rather than scanning for ' \d+ ') keeps neighbouring numbers
  # from swallowing each other's separating space.
  for token in answer.split /[\s,]+/
    if /^\d+$/.test token
      numbers.push parseInt(token, 10)
    else if range = token.match /^(\d+)-(\d+)$/
      for number in [parseInt(range[1], 10) .. parseInt(range[2], 10)]
        numbers.push number
  valid = []
  for number in numbers
    if number > max
      console.log "Incorrect number #{number} higher than max: #{max}"
    else
      valid.push number unless number in valid
  valid
# Renders one user's descriptive fields as a single separator-joined string.
#
# user           - the user to describe.
# relativeToUser - optional; when given, the signup date is expressed relative
#                  to this user's dateCreated instead of relative to now.
# separator      - string placed between the fields.
formatUser = (user, relativeToUser, separator=' ') ->
  fields = ['name', 'emailLower', 'ageRange', 'dateCreated', 'lastLevel', 'points', 'referrer', 'hourOfCode']
  cells = for field in fields
    if field is 'dateCreated'
      created = moment user[field]
      if relativeToUser
        created.from relativeToUser.dateCreated
      else
        created.fromNow()
    else
      user[field]
  cells.join separator
# Builds the console prompt for one user: the user as entry 0, followed by one
# numbered line per suggestion with its school name, match reasons and the
# suggesting user's details.
formatSuggestions = (userToSchool, suggestions) ->
  describe = (s, i) ->
    matchWord = if s.reasons.length > 1 then 'Matches' else 'Match'
    "#{_.str.rpad(i + 1, 3)} #{_.str.rpad(s.schoolName, 50)} #{s.reasons.length} #{matchWord}: #{s.reasons.join(', ')}\tfrom user: #{formatUser(s.user, userToSchool)}"
  suggestionPrompts = (describe(s, i) for s, i in suggestions).join('\n')
  [
    "What should the school for this user be?"
    "0 #{_.str.rpad(userToSchool.schoolName, 50)} #{formatUser(userToSchool)}"
    "Suggestions:"
    suggestionPrompts
    "Choose a number, type a name, enter to skip, or q to quit."
    "> "
  ].join '\n'
# Writes the user and its suggestions to ~/Downloads/<email>.tsv and opens the
# file in Numbers.app.
#
# userToSchool - the user being classified (row 0).
# suggestions  - suggestion objects, written as rows 1..n.
openTSV = (userToSchool, suggestions) ->
  header = ['#', 'School Name', 'Matches', 'Name', 'Email', 'Age', 'Signup', 'Last Level', 'Points', 'Referrer', 'HoC'].join '\t'
  rows = [[0, userToSchool.schoolName, '', formatUser(userToSchool, null, '\t')].join '\t']
  for s, i in suggestions
    # Pick the label first: written inline, the trailing `+ ': ' + reasons`
    # bound to the else branch only and was lost whenever there were several reasons.
    label = if s.reasons.length > 1 then 'Matches' else 'Match'
    matches = "#{s.reasons.length} #{label}: #{s.reasons.join(', ')}"
    rows.push [i + 1, s.schoolName, matches, formatUser(s.user, userToSchool, '\t')].join '\t'
  contents = [header].concat(rows).join('\n') + '\n'
  # The email address is user-supplied, so keep only characters that are safe in a file name.
  safeName = "#{userToSchool.emailLower}".replace /[^\w.@+-]/g, '_'
  path = "#{process.env.HOME}/Downloads/#{safeName}.tsv"
  # writeFile truncates by default; the former `flags` option is not one it recognises.
  fs.writeFile path, contents, (err) ->
    return console.log 'Error writing school suggestions TSV:', err if err
    # execFile passes the path as a single argument without going through a shell.
    require('child_process').execFile 'open', ['-a', '/Applications/Numbers.app', path], (openErr) ->
      console.log 'Error opening school suggestions TSV:', openErr if openErr
# Group keys for which findUserToSchool has already returned a target; those
# groups are skipped on later calls. Keyed by group name only, not by field.
checkedTopGroups = {}

# Chooses the next user to classify together with the suggestions for them.
#
# users - not read here; candidates come from the module-level userCategories
#         and topGroups tables that sortUsers rebuilds.
#
# Returns [bestTarget, bestTargetSuggestions]; bestTarget is null (and the list
# empty) when no sampled user scored above 0.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# that had lost its indentation -- in particular, confirm that `break` belongs
# to the per-group loop (i.e. only the first eligible group of each field is sampled).
findUserToSchool = (users) ->
  # We find the top user from the top group that we can make the most reasoned suggestions about what the school name would be.
  [bestTarget, bestTargetSuggestions, bestSuggestionsScore, bestGroup] = [null, [], 0, null]
  for field, groups of topGroups
    for nextLargestGroup in groups when not checkedTopGroups[nextLargestGroup]
      possibleTargets = userCategories[field][nextLargestGroup]
      schoolNames = (t.schoolName for t in _.uniq possibleTargets, 'schoolName')
      # TODO: better method to avoid showing users where everyone in the suggestion already has the same school (because we have already done this group)
      alreadyDone = false
      for schoolName in schoolNames when schoolName?.length > 10 and /,.+,/.test schoolName  # Long enough school name with location info (two commas)
        sharedCount = _.filter(possibleTargets, schoolName: schoolName).length
        # A group counts as done when one fully-qualified name is shared by more
        # than 20 members and by more than a quarter of the group.
        if sharedCount > 20 and sharedCount > 0.25 * possibleTargets.length
          console.log 'Already done', schoolName, sharedCount, possibleTargets.length, 'for', field, nextLargestGroup
          alreadyDone = true
      continue if alreadyDone
      # Sample between 4 and 15 members, spread evenly over the group.
      nSamples = Math.min 15, Math.max(4, Math.floor possibleTargets.length / 20)
      if debugging then console.log 'Checking', nSamples, 'samples of', possibleTargets.length, 'players in the biggest', field, 'group:', nextLargestGroup
      for i in [0 ... nSamples]
        target = possibleTargets[Math.floor i * possibleTargets.length / (nSamples + 1)]
        suggestions = findSuggestions target
        suggestionsScore = scoreSuggestions suggestions, target
        # Keep the highest-scoring sample seen across all fields.
        if suggestionsScore > bestSuggestionsScore
          bestTarget = target
          bestTargetSuggestions = suggestions
          bestSuggestionsScore = suggestionsScore
          bestGroup = nextLargestGroup
      break
  # Note: when nothing scored, bestGroup is still null and the key "null" gets marked.
  checkedTopGroups[bestGroup] = true
  return [bestTarget, bestTargetSuggestions]
# Collects other users who are probably at the same school as `target`.
#
# Evidence comes from a shared last IP, shared course instances or clans, a
# similar school name, a shared uncommon email domain, or a shared referrer.
# Each related user appears once; every piece of evidence adds an entry to
# that suggestion's `reasons` list.
#
# Returns an array of {schoolName, reasons, user}, ordered by descending score
# and, within equal scores, by lower-cased school name.
findSuggestions = (target) ->
  # Look for other users with the same IP, course instances, clans, or similar school names or non-common shared email domains.
  # TODO: Actually make suggestions based on students that signed up at almost the same time
  suggestions = []
  t0 = new Date()

  # Records `reason` for otherUser, creating the suggestion on first sight.
  # Users are matched by object identity; all lists come from the same
  # userCategories tables, so the same user is always the same object.
  addReason = (otherUser, reason, schoolName) ->
    return if otherUser is target
    existingSuggestion = _.find suggestions, (s) -> s.user is otherUser
    if existingSuggestion
      existingSuggestion.reasons.push reason
    else
      suggestions.push schoolName: schoolName, reasons: [reason], user: otherUser
    return

  if debugging then console.log ' Checking suggestions for', target.emailLower, target.schoolName, (new Date()) - t0
  if target.lastIP
    for otherUser in (userCategories.lastIP[target.lastIP] ? [])
      addReason otherUser, 'IP', otherUser.schoolName

  for leagueType in ['courseInstances', 'clans']
    if debugging then console.log ' Now checking', leagueType, (new Date()) - t0
    reason = _.str.humanize(leagueType)
    for league in (target[leagueType] ? [])
      for otherUser in (userCategories[leagueType][league] ? [])
        addReason otherUser, reason, otherUser.schoolName

  if target.schoolName?.length > 5
    if debugging then console.log ' Now checking schoolName', (new Date()) - t0
    nameMatches = []
    for otherSchoolName in topGroups.schoolName
      score = stringScore otherSchoolName, target.schoolName, 0.8
      # `unless >=` also rejects a NaN score, which `< 0.25` would let through.
      continue unless score >= 0.25
      nameMatches.push schoolName: otherSchoolName, score: score
    nameMatches = (match.schoolName for match in (_.sortBy nameMatches, (match) -> -match.score))
    # Only the ten most similar names contribute.
    for match in nameMatches.slice(0, 10)
      for otherUser in (userCategories.schoolName[match] ? [])
        addReason otherUser, "Name", match

  if debugging then console.log ' Now checking domain', (new Date()) - t0
  if domain = getDomain target
    for otherUser in (userCategories.domain[domain] ? [])
      addReason otherUser, "Domain", otherUser.schoolName

  if debugging then console.log ' Now checking referrer', (new Date()) - t0
  if referrer = getReferrer target
    for otherUser in (userCategories.referrer[referrer] ? [])
      addReason otherUser, "Referrer", otherUser.schoolName
  if debugging then console.log ' Done checking referrer', (new Date()) - t0

  # Two stable sorts: alphabetical first, then by score, so ties stay alphabetical.
  suggestions = _.sortBy suggestions, (s) -> (s.schoolName or '').toLowerCase()
  suggestions = _.sortBy suggestions, (s) -> -scoreSuggestions [s], target
  return suggestions
# Scores a list of suggestions by summing a weight for every reason attached
# to every suggestion.
#
# suggestions - array of objects with a `reasons` array of reason labels.
# target      - the user the suggestions were made for; only consulted for
#               'Domain' reasons, which weigh less on a few very large domains.
#
# Returns the total weight; 0 for an empty list. Each suggestion is counted
# exactly once (previously an extra inner loop inside the reduce callback
# counted the whole list once per element). Unknown reasons weigh 0 rather
# than turning the sum into NaN.
scoreSuggestions = (suggestions, target) ->
  domainWeight = null  # computed on first use; it is the same for every suggestion
  total = 0
  for suggestion in suggestions
    for reason in suggestion.reasons
      total += switch reason
        when 'Course instances' then 150
        when 'IP' then 40
        when 'Referrer' then 20
        when 'Name' then 15
        when 'Domain'
          domainWeight ?= (if getDomain(target) in ['cps.edu', 'mynewcaneyisd.org', 'fsusd.org', 'edison.k12.nj.us'] then 1 else 10)
          domainWeight
        when 'Clans' then 0.01
        else 0
  total
# field -> (value -> users having that value); rebuilt by sortUsers.
userCategories = {}
# field -> group keys ordered from largest to smallest group; rebuilt by sortUsers.
topGroups = {}
usersCategorized = {}

# Rebuilds userCategories and topGroups from `users`.
# Users are ordered by school name, then last IP, then descending points
# (three stable sorts, least significant first) before being grouped. Only
# groups with more than 2 members and fewer than 5000 (30 for clans) are
# listed in topGroups.
sortUsers = (users) ->
  byPoints = _.sortBy users, (u) -> -u.points
  byIP = _.sortBy byPoints, 'lastIP'
  ordered = _.sortBy byIP, (u) -> (u.schoolName or '').toLowerCase()
  for field in ['courseInstances', 'lastIP', 'schoolName', 'domain', 'clans', 'referrer']
    grouped = categorizeUsers ordered, field
    userCategories[field] = grouped
    sizeLimit = if field is 'clans' then 30 else 5000
    largestFirst = _.sortBy _.keys(grouped), (key) -> -grouped[key].length
    topGroups[field] = (group for group in largestFirst when 2 < grouped[group].length < sizeLimit)
# Groups users by the value(s) they have for `field`.
#
# 'domain' and 'referrer' are derived through getDomain/getReferrer; any other
# field is read straight off the user. Array-valued fields put the user into
# one group per element. String values are trimmed, and empty values are skipped.
#
# Returns an object mapping each value to the array of users having it.
categorizeUsers = (users, field) ->
  categories = {}
  for user in users
    raw = switch field
      when 'domain' then getDomain user
      when 'referrer' then getReferrer user
      else user[field]
    continue unless raw
    entries = if _.isArray(raw) then raw else [raw]
    for entry in entries when entry
      if entry.trim
        entry = entry.trim()
        continue unless entry
      (categories[entry] ?= []).push user
  categories
typoCache = {}
# Returns the email domain of `user` when it is specific enough to hint at a
# school, or null otherwise.
#
# null is returned when the user has no usable email (missing, null, or without
# an '@' part) and when the domain is listed in commonEmailDomainMap.
getDomain = (user) ->
  # `?.` guards against a missing or null emailLower (a `$exists: true` query still matches null values).
  return null unless domain = user.emailLower?.split('@')[1]
  return null if commonEmailDomainMap[domain]
  # Too slow? Is this actually slow?
  #typo = typoCache[domain]
  #return null if typo
  #return domain if typo is false
  #typo = _.find commonEmailDomains, (commonDomain) -> stringScore(commonDomain, domain, 0.8) > 0.9
  #typoCache[domain] = Boolean(typo)
  #return null if typo
  domain
# Referrers matching this pattern are treated as too generic to identify a school.
commonReferrersRegex = /(google|bing\.|yahoo|duckduckgo|jobs\.lever|code\.org|twitter|facebook|dollarclick|stumbleupon|vk\.com|playpcesor|reddit|lifehacker|favorite|bnext|freelance|taringa|blogthinkbig|graphism|inside\.com|korben|habrahabr|iplaysoft|geekbrains|playground|ycombinator|github)/

# Reduces a user's referrer to a grouping key.
#
# Returns null when there is no referrer, when it matches commonReferrersRegex,
# or when it mentions codecombat without a class code. A referrer ending in
# `?_cc=<code>` yields just the code. Anything else yields the lower-cased,
# trimmed referrer with its http(s):// prefix removed.
getReferrer = (user) ->
  return null unless user.referrer?
  cleaned = user.referrer.toLowerCase().trim()
  return null unless cleaned
  cleaned = cleaned.replace /^https?:\/\//, ''
  return null if commonReferrersRegex.test cleaned
  codeMatch = cleaned.match /\?_cc=(\S+)$/
  return codeMatch[1] if codeMatch?[1]
  return null if /codecombat/.test cleaned
  cleaned
# https://github.com/joshaven/string_score
# Scores how well `word` matches the string `_a`.
#
# _a        - the string being searched.
# word      - the abbreviation / query to look for.
# fuzziness - optional; when falsy, a character of `word` that cannot be found
#             makes the score 0. Otherwise each missing character divides the
#             score by an extra (1 - fuzziness).
#
# Returns 1 for an exact match and 0 when either string is empty (and they
# differ); otherwise a score where higher means a better match.
#
# Both modes share one loop and one set of bonuses: +0.8 for a hit right
# after a space (acronym), +0.1 for a same-case hit. The non-fuzzy loop
# previously compared the preceding character with the query character
# instead of ' ' and left out the same-case bonus.
stringScore = (_a, word, fuzziness) ->
  return 1 if word is _a
  return 0 if word is ""
  string = _a
  # An empty haystack would otherwise divide by zero below.
  return 0 if string is ""
  lString = string.toLowerCase()
  lWord = word.toLowerCase()
  strLength = string.length
  wordLength = word.length
  runningScore = 0
  startAt = 0
  fuzzies = 1
  fuzzyFactor = if fuzziness then 1 - fuzziness else 0
  for i in [0...wordLength]
    # Next case-insensitive occurrence of this character at or after startAt.
    idxOf = lString.indexOf lWord[i], startAt
    if idxOf is -1
      return 0 unless fuzziness
      fuzzies += fuzzyFactor
      continue
    if startAt is idxOf
      # Consecutive-letter / start-of-string bonus.
      charScore = 0.7
    else
      charScore = 0.1
      # Acronym bonus.
      charScore += 0.8 if string[idxOf - 1] is ' '
    # Same-case bonus.
    charScore += 0.1 if string[idxOf] is word[i]
    runningScore += charScore
    startAt = idxOf + 1
  # Averaging over both lengths reduces the penalty for longer strings.
  finalScore = 0.5 * (runningScore / strLength + runningScore / wordLength) / fuzzies
  finalScore += 0.15 if lWord[0] is lString[0] and finalScore < 0.85
  finalScore
# Prints `question` to stdout and hands the next chunk read from stdin,
# converted to a trimmed string, to `callback`.
prompt = (question, callback) ->
  {stdin, stdout} = process
  stdin.resume()
  stdout.write question
  stdin.once 'data', (chunk) ->
    callback chunk.toString().trim()
# https://github.com/mailcheck/mailcheck/wiki/List-of-Popular-Domains
# Email domains too widespread to say anything about a user's school.
# A few domains appear in more than one group; that is harmless for the lookup table built below.
commonEmailDomains = [
  # Default domains included
  "aol.com", "att.net", "comcast.net", "facebook.com", "gmail.com", "gmx.com", "googlemail.com",
  "google.com", "hotmail.com", "hotmail.co.uk", "mac.com", "me.com", "mail.com", "msn.com",
  "live.com", "sbcglobal.net", "verizon.net", "yahoo.com", "yahoo.co.uk",
  # Other global domains
  "email.com", "games.com", "gmx.net", "hush.com", "hushmail.com", "icloud.com", "inbox.com",
  "lavabit.com", "love.com", "outlook.com", "pobox.com", "rocketmail.com",
  "safe-mail.net", "wow.com", "ygm.com", "ymail.com", "zoho.com", "fastmail.fm",
  # United States ISP domains
  "bellsouth.net", "charter.net", "comcast.net", "cox.net", "earthlink.net", "juno.com",
  # British ISP domains
  "btinternet.com", "virginmedia.com", "blueyonder.co.uk", "freeserve.co.uk", "live.co.uk",
  "ntlworld.com", "o2.co.uk", "orange.net", "sky.com", "talktalk.co.uk", "tiscali.co.uk",
  "virgin.net", "wanadoo.co.uk", "bt.com",
  # Domains used in Asia
  "sina.com", "qq.com", "naver.com", "hanmail.net", "daum.net", "nate.com", "yahoo.co.jp", "yahoo.co.kr", "yahoo.co.id", "yahoo.co.in", "yahoo.com.sg", "yahoo.com.ph", "yahoo.com.tw",
  # French ISP domains
  "hotmail.fr", "live.fr", "laposte.net", "yahoo.fr", "wanadoo.fr", "orange.fr", "gmx.fr", "sfr.fr", "neuf.fr", "free.fr",
  # German ISP domains
  "gmx.de", "hotmail.de", "live.de", "online.de", "t-online.de", "web.de", "yahoo.de",
  # Russian ISP domains
  "mail.ru", "rambler.ru", "yandex.ru", "ya.ru", "list.ru",
  # Belgian ISP domains
  "hotmail.be", "live.be", "skynet.be", "voo.be", "tvcablenet.be", "telenet.be",
  # Argentinian ISP domains
  "hotmail.com.ar", "live.com.ar", "yahoo.com.ar", "fibertel.com.ar", "speedy.com.ar", "arnet.com.ar",
  # Domains used in Mexico
  "hotmail.com", "gmail.com", "yahoo.com.mx", "live.com.mx", "yahoo.com", "hotmail.es", "live.com", "hotmail.com.mx", "prodigy.net.mx", "msn.com"
]

# Lookup table: domain -> true for every entry above.
commonEmailDomainMap = {}
for popularDomain in commonEmailDomains
  commonEmailDomainMap[popularDomain] = true