Skip to content

Commit

Permalink
Prevent re-listing in less than 30 days.
Browse files Browse the repository at this point in the history
The words in a listing are now scraped into a bag of words. All new listings are now checked against previous listings from the same email domain. If there is more than a 60% match between the bags of words, the listing is refused. We use this approach instead of a direct comparison to make it more robust. Listings will not be considered unique after simple formatting changes, shifting words around, or substituting a few throwaway lines. They'll need to be substantially different. Listings are checked against the email domain rather than the email address because new addresses are cheap while new domains are not.

This commit also fixes a long-known bug that allowed the email address to be changed after a listing was published.
  • Loading branch information
jace committed Nov 1, 2011
1 parent b8e1d38 commit 09cd6ff
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 42 deletions.
2 changes: 1 addition & 1 deletion forms.py
Expand Up @@ -38,7 +38,7 @@ class ListingForm(Form):
validators=[Required(u"If this job doesn’t have a fixed location, use “Anywhere”")])
job_relocation_assist = BooleanField("Relocation assistance available")
job_description = TextAreaField("Description",
description=u"Our apologies for the mismatched font you see here. We’re working on it.",
description=u"Our apologies for the mismatched font you see here. We’re working on it",
validators=[Required("A description of the job is required")])
job_perks = BooleanField("Job perks are available")
job_perks_description = TextAreaField("Describe job perks",
Expand Down
1 change: 1 addition & 0 deletions models.py
Expand Up @@ -126,6 +126,7 @@ class JobPost(db.Model):
md5sum = db.Column(db.String(32), nullable=False, index=True)

# Payment, audit and workflow fields
words = db.Column(db.UnicodeText, nullable=True) # All words in description, perks and how_to_apply
promocode = db.Column(db.String(40), nullable=True)
status = db.Column(db.Integer, nullable=False, default=POSTSTATUS.DRAFT)
ipaddr = db.Column(db.String(45), nullable=False)
Expand Down
8 changes: 8 additions & 0 deletions sass/_layout.scss
Expand Up @@ -193,6 +193,9 @@ footer {
.post-company-logo {
float: right;
}
#apply-info {
line-height: 1.5;
}
#detailed-info {
float: left;
max-width: 600px;
Expand Down Expand Up @@ -524,5 +527,10 @@ h2 {
margin: 1em 0;
}

#newpost_headline:hover {
@include border-radius(2px);
@include box-shadow(#ccf 0 0 5px, #ccf 0 0 5px 0 inset);
}

/* Spam protection */
.z {display: none;}
16 changes: 16 additions & 0 deletions static/css/screen.css
Expand Up @@ -531,6 +531,9 @@ footer {
#sheet .post-company-logo {
float: right;
}
#sheet #apply-info {
line-height: 1.5;
}
#sheet #detailed-info {
float: left;
max-width: 600px;
Expand Down Expand Up @@ -930,6 +933,19 @@ h2 {
margin: 1em 0;
}

#newpost_headline:hover {
-moz-border-radius: 2px;
-webkit-border-radius: 2px;
-o-border-radius: 2px;
-ms-border-radius: 2px;
-khtml-border-radius: 2px;
border-radius: 2px;
-moz-box-shadow: #ccccff 0 0 5px, #ccccff 0 0 5px 0 inset;
-webkit-box-shadow: #ccccff 0 0 5px, #ccccff 0 0 5px 0 inset;
-o-box-shadow: #ccccff 0 0 5px, #ccccff 0 0 5px 0 inset;
box-shadow: #ccccff 0 0 5px, #ccccff 0 0 5px 0 inset;
}

/* Spam protection */
.z {
display: none;
Expand Down
2 changes: 1 addition & 1 deletion templates/detail.html
Expand Up @@ -75,7 +75,7 @@ <h2>What’s wrong with it?</h2>
</div>
</div>
</div>
<div class="section">
<div class="section" id="apply-info">
<h2>Apply for this position</h2>
<p>{{ post.how_to_apply|scrubemail(('z', 'y')) }}</p>
</div>
Expand Down
54 changes: 54 additions & 0 deletions utils.py
Expand Up @@ -188,6 +188,60 @@ def convertemail(m):
return data


# Patterns are written as raw strings so that regex escapes such as \W reach
# the regex compiler untouched; in a plain string literal '\W' is an invalid
# string escape that newer Python versions warn about.
WORDSPLIT_RE = re.compile(r'\W+')   # A run of one or more non-word characters
TAGSPLIT_RE = re.compile(r'<.*?>')  # Shortest span from '<' to the next '>' (does not cross newlines)

def striptags(text):
    """
    Replace each HTML/XML tag found in text with a single space and
    return the resulting string:

    >>> striptags('<h1>title</h1>')
    ' title '
    >>> striptags('plain text')
    'plain text'
    >>> striptags(u'word<br>break')
    u'word break'
    """
    # A space (rather than an empty string) is substituted so that words on
    # either side of a tag are not fused together.
    return re.sub(TAGSPLIT_RE, ' ', text)


def getwords(text):
    """
    Return the list of words in text. The text is split on runs of
    non-word characters (punctuation, whitespace), which are discarded:

    >>> getwords('this is some text.')
    ['this', 'is', 'some', 'text']
    >>> getwords('and/or')
    ['and', 'or']
    >>> getwords('one||two')
    ['one', 'two']
    >>> getwords("does not is doesn't")
    ['does', 'not', 'is', 'doesn', 't']
    >>> getwords(u'hola unicode!')
    [u'hola', u'unicode']
    """
    # Splitting yields an empty token only when the text starts or ends with
    # a separator (or is empty), so dropping every empty token is the same as
    # trimming the two ends.
    return [token for token in WORDSPLIT_RE.split(text) if token != '']


def get_word_bag(text):
    """
    Build the "bag of words" for text: every distinct word, sorted
    alphabetically and joined into one space-separated string.

    Tags are first removed with striptags and the result is passed through
    simplify_text (defined elsewhere in this module; judging by the example
    below it presumably lowercases and drops punctuation -- confirm there).

    >>> get_word_bag("This is a piece\tof text with this extra bit!")
    'a bit extra is of piece text this with'
    """
    simplified = simplify_text(striptags(text))
    # A set removes duplicate words; sorted() gives the bag a stable order.
    unique_words = set(simplified.split(' '))
    return " ".join(sorted(unique_words))


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    from doctest import testmod
    testmod()
104 changes: 64 additions & 40 deletions views.py
Expand Up @@ -6,6 +6,7 @@
from datetime import date, datetime, timedelta
from urllib import quote, quote_plus
from pytz import utc, timezone
from difflib import SequenceMatcher
from flask import (render_template, redirect, url_for, request, session, abort,
flash, g, Response, Markup, escape, jsonify)
from flaskext.mail import Mail, Message
Expand All @@ -16,7 +17,7 @@
from models import db, POSTSTATUS, JobPost, JobType, JobCategory, JobPostReport, ReportCode, unique_hash, agelimit
import forms
from uploads import uploaded_logos, process_image
from utils import sanitize_html, scrubemail, md5sum, get_email_domain
from utils import sanitize_html, scrubemail, md5sum, get_email_domain, get_word_bag
from search import do_search

mail = Mail()
Expand Down Expand Up @@ -273,7 +274,7 @@ def jobdetail(hashid):
db.session.add(report)
db.session.commit()
if request.is_xhr:
return "<p>Thanks! This job listing has been flagged for review.</p>" #Ugh!
return "<p>Thanks! This job listing has been flagged for review.</p>" #FIXME: Ugh!
else:
flash("Thanks! This job listing has been flagged for review.", "interactive")
elif request.method == 'POST' and request.is_xhr:
Expand Down Expand Up @@ -375,48 +376,71 @@ def editjob(hashid, key, form=None, post=None, validated=False):
form.job_type.choices = [(ob.id, ob.title) for ob in JobType.query.filter_by(public=True).order_by('seq')]
form.job_category.choices = [(ob.id, ob.title) for ob in JobCategory.query.filter_by(public=True).order_by('seq')]
if post is None:
post = JobPost.query.filter_by(hashid=hashid).first()
if post is None:
abort(404)
post = JobPost.query.filter_by(hashid=hashid).first_or_404()
if key != post.edit_key:
abort(403)
#if request.method == 'POST' and post.status != POSTSTATUS.DRAFT:
# form.poster_email.data = post.email
# Don't allow email address to be changed once it's confirmed
if request.method == 'POST' and post.status >= POSTSTATUS.CONFIRMED:
form.poster_email.data = post.email
if request.method == 'POST' and (validated or form.validate()):
post.headline = form.job_headline.data
post.type_id = form.job_type.data
post.category_id = form.job_category.data
post.location = form.job_location.data
post.relocation_assist = form.job_relocation_assist.data
post.description = sanitize_html(form.job_description.data)
post.perks = sanitize_html(form.job_perks_description.data) if form.job_perks.data else ''
post.how_to_apply = form.job_how_to_apply.data
post.company_name = form.company_name.data
post.company_url = form.company_url.data
post.email = form.poster_email.data
post.email_domain = get_email_domain(post.email)
post.md5sum = md5sum(post.email)

# TODO: Provide option of replacing logo or leaving it alone
if request.files['company_logo']:
thumbnail = g.company_logo
#if 'company_logo' in g:
# # The validator saved a copy of the processed logo
# thumbnail = g['company_logo']
#else:
# thumbnail = process_image(request.files['company_logo'])
logofilename = uploaded_logos.save(thumbnail, name='%s.' % post.hashid)
post.company_logo = logofilename
form_description = sanitize_html(form.job_description.data)
form_perks = sanitize_html(form.job_perks_description.data) if form.job_perks.data else ''
form_how_to_apply = form.job_how_to_apply.data
form_email_domain = get_email_domain(form.poster_email.data)
form_words = get_word_bag(u' '.join((form_description, form_perks, form_how_to_apply)))

similar = False
for oldpost in JobPost.query.filter(JobPost.email_domain == form_email_domain).filter(
JobPost.status > POSTSTATUS.PENDING).filter(
JobPost.datetime > datetime.utcnow() - agelimit).all():
if oldpost.id != post.id:
if oldpost.words:
s = SequenceMatcher(None, form_words, oldpost.words)
if s.ratio() > 0.6:
similar = True
break

if similar:
flash("This listing is very similar to an earlier listing. You may not relist the same job "
"in less than %d days. If you believe this to be an error, please email us at %s." % (agelimit.days,
app.config['ADMINS'][0]), category='interactive')
else:
if form.company_logo_remove.data:
post.company_logo = None
post.headline = form.job_headline.data
post.type_id = form.job_type.data
post.category_id = form.job_category.data
post.location = form.job_location.data
post.relocation_assist = form.job_relocation_assist.data
post.description = form_description
post.perks = form_perks
post.how_to_apply = form_how_to_apply
post.company_name = form.company_name.data
post.company_url = form.company_url.data
post.email = form.poster_email.data
post.email_domain = form_email_domain
post.md5sum = md5sum(post.email)
# To protect from gaming, don't allow words to be removed in edited listings once the post
# has been confirmed. Just add the new words.
if post.status >= POSTSTATUS.CONFIRMED:
prev_words = post.words or ''
else:
prev_words = u''
post.words = get_word_bag(u' '.join((prev_words, form_description, form_perks, form_how_to_apply)))

if request.files['company_logo']:
# The form's validator saved the processed logo in g.company_logo.
thumbnail = g.company_logo
logofilename = uploaded_logos.save(thumbnail, name='%s.' % post.hashid)
post.company_logo = logofilename
else:
if form.company_logo_remove.data:
post.company_logo = None

db.session.commit()
userkeys = session.get('userkeys', [])
userkeys.append(post.edit_key)
session['userkeys'] = userkeys
session.permanent = True
return redirect(url_for('jobdetail', hashid=post.hashid), code=303)
db.session.commit()
userkeys = session.get('userkeys', [])
userkeys.append(post.edit_key)
session['userkeys'] = userkeys
session.permanent = True
return redirect(url_for('jobdetail', hashid=post.hashid), code=303)
elif request.method == 'POST':
flash("Please correct the indicated errors", category='interactive')
elif request.method == 'GET':
Expand All @@ -434,7 +458,7 @@ def editjob(hashid, key, form=None, post=None, validated=False):
form.company_url.data = post.company_url
form.poster_email.data = post.email

return render_template('postjob.html', form=form)#, no_email=post.status != POSTSTATUS.DRAFT)
return render_template('postjob.html', form=form, no_email=post.status > POSTSTATUS.DRAFT)


@app.route('/new', methods=('GET', 'POST'))
Expand Down

0 comments on commit 09cd6ff

Please sign in to comment.