Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
245 lines (210 sloc)
9.19 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'mechanize' | |
| #require 'logger' | |
| require 'open-uri' | |
| require 'yaml' | |
| require 'io/console' | |
| START_THREAD_CUTOFF = 120000 | |
| # sciencemadness domain whitelist, built from forum links posted between 2002 and 2016 | |
| POPULAR_DOMAINS = open("sm-linked-domains.txt").read.split("\n") | |
| #$log = Logger.new "/home/tom/workspace/mechlog.txt" | |
| #$log.level = Logger::DEBUG | |
| # I don't think I use these methods now that I understand nokogiri better | |
| class Nokogiri::XML::Element | |
| def grandchildren | |
| self.children.children | |
| end | |
| def great_grandchildren | |
| self.children.children.children | |
| end | |
| end | |
| # putting all these methods in a module, just to make things a bit neater | |
| module BK | |
| # base uri for the forum | |
| $uri = URI("http://www.sciencemadness.org/talk/") | |
| $tid_cutoff = START_THREAD_CUTOFF | |
| # request the password on each execution, so it doesn't need to be stored | |
| user = "Melgar" | |
| print "Password: " | |
| # password is no longer saved, and should not show up in any logs ever | |
| pw = '' | |
| loop do | |
| pw << STDIN.getch | |
| if pw[-1] == "\u0003" | |
| puts "Password input cleared, starting over." | |
| pw = '' | |
| elsif pw[-1] == "\r" | |
| pw = pw.chop | |
| break | |
| end | |
| end | |
| # create a mechanize agent named 'botkilla' and navigate to the login page | |
| $botkilla = Mechanize.new # {|a| a.log = $log} | |
| $botkilla.user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/69.0.3497.81 Chrome/69.0.3497.81 Safari/537.36" | |
| $botkilla.get("http://www.sciencemadness.org/talk/misc.php?action=login") | |
| $login_form = $botkilla.page.forms.first | |
| $last_error = "" | |
| $users = {} | |
| $checked_threads = [] | |
| $kill_count = 0 | |
| $start_time = Time.now | |
| $users_updated = Time.at(0) | |
| # enter username and password into form, then log in | |
| $login_form.field_with(:name => "username").value = user | |
| $login_form.field_with(:name => "password").value = pw | |
| $login_form.click_button | |
| pw = '' | |
| # parse the "today's posts" page | |
| def self.new_posts | |
| $botkilla.get($uri.to_s + "today.php") | |
| ar = [] | |
| # get HTML elements that are rows with class 'tablerow' | |
| $botkilla.page.xpath("//tr[@class='tablerow']").each do |tr| | |
| h = {} | |
| begin | |
| # pull link, title, username, etc. out of the nested Nokogiri mess | |
| title_cell = tr.at_xpath("./td[@width='43%']/font/a") | |
| next if title_cell.nil? | |
| h['link'] = title_cell['href'] | |
| h['title'] = title_cell.text.force_encoding("UTF-8").encode("ISO-8859-1", invalid: :replace, undef: :replace, replace: '').force_encoding("UTF-8").scrub | |
| h['username'] = tr.at_xpath("./td[@width='14%' and @bgcolor='#fffbe8']").text | |
| h['replies'] = tr.xpath("./td[@width='5%' and @bgcolor='#fffbe8']/font").text.to_i | |
| h['last_poster'] = tr.at_xpath("./td[@width='23%']/table/tr/td/font/a").text | |
| h['tid'] = h['link'][/(?<=tid=)\d+/].to_i | |
| rescue => e | |
| if $last_error == e.to_s | |
| print "." | |
| else | |
| $last_error = e.to_s | |
| print e.to_s | |
| end | |
| next | |
| end | |
| ar << h | |
| end | |
| return ar.select {|h| h['tid'] > $tid_cutoff} | |
| end | |
| # refresh list of most recently registered users | |
| def self.update_users | |
| $botkilla.get($uri.to_s + "misc.php?action=list&desc=desc") | |
| rows = $botkilla.page.search("tr") | |
| $users = rows[11].text.scan(/(?<=\r\n)(.+?)\r\nMember[\r\n]+(\d+\/\d+\/\d+)\r\n(\d+)/) | |
| $users.map! {|a| [a[0], {"reg_date" => DateTime.strptime(a[1], '%m/%d/%y').to_time, "post_count" => a[2].to_i}]} | |
| $users = $users.to_h | |
| $users_updated = Time.now | |
| # maps the users on "today's posts" with how many posts they have on that page | |
| ulist = BK.new_posts.map {|u| u['username']} | |
| $recent_post_count = ulist.inject({}) {|h,v| h.merge(v => h[v].to_i + 1)} | |
| end | |
| # use moderator tools to delete a thread | |
| def self.delete_thread(h) | |
| begin | |
| $botkilla.get("http://www.sciencemadness.org/talk/topicadmin.php?tid=#{h['tid']}&action=delete") | |
| $botkilla.page.forms.first.click_button | |
| puts "Deleted thread with title \e[1;37m'#{h['title']}'\e[0m at \e[1;37m#{Time.now.ctime}\e[0m because \e[1;37m#{h['flags'].join(', ')}\e[0m." | |
| $kill_count += 1 | |
| run_time = Time.now - $start_time | |
| new_cutoff = h['tid'].to_i - 100 | |
| $tid_cutoff = new_cutoff if new_cutoff > $tid_cutoff # increment $tid_cutoff | |
| puts "\n\n" | |
| puts "\e[1;32mKilled #{$kill_count} spam posts in #{(run_time/3600).to_i} hours and #{((run_time % 3600)/60).to_i} minutes. (#{(($kill_count*3600)/run_time).round(2)} kills/hour)\e[0m" | |
| rescue => e | |
| puts e | |
| end | |
| end | |
| # get a bunch more background info on a particular thread | |
| def self.investigate_thread(h) | |
| # If the title contains words common in spam titles | |
| if h['title'][/(sex|passionate|adult|porn|galleries|unencumbered|mature|viagra|cialis|callow|teen|casino|jerseys|passports|\p{^ASCII})/i] | |
| h['spam_score'] = h['spam_score'].to_i + 4 | |
| if h['title'][/\p{^ASCII}/] | |
| h['flags'] = h['flags'].to_a + ['non-ASCII characters in title'] | |
| else | |
| h['flags'] = h['flags'].to_a + ['spam words in title'] | |
| end | |
| end | |
| # if the list of most recently registered users includes the user in question, add | |
| # 5 points for registereing in the last 36 hours, 2 points oherwise. | |
| if $users.keys.include? h['username'] | |
| if (Time.now - $users[h['username']]['reg_date']) / 3600 < 36 | |
| h['spam_score'] = h['spam_score'].to_i + 5 | |
| h['flags'] = h['flags'].to_a + ['registered since yesterday'] | |
| else | |
| h['spam_score'] = h['spam_score'].to_i + 2 | |
| h['flags'] = h['flags'].to_a + ['registered recently'] | |
| end | |
| end | |
| # if the user has posted unusually frequently | |
| if $recent_post_count[h['username']].to_i > 5 | |
| h['spam_score'] = h['spam_score'].to_i + 4 | |
| h['flags'] = h['flags'].to_a + ['flood-posting'] | |
| end | |
| # scan the text of a post for links, then determine if they're appropriate links for this site | |
| $botkilla.get($uri.to_s + h['link']) # rescue return h | |
| body = $botkilla.page.at_xpath("//td[@class='tablerow' and @valign='top' and @style='height: 80px; width: 82%']/font[@class='mediumtxt']") | |
| h['thread_text'] = body.text | |
| links = body.xpath("./a").map {|e| e['href']}.join(' ') # adds linked urls to body text for scanning | |
| # fix encoding glitches between ISO-8859-1 and UTF-8 | |
| h['thread_text'] = h['thread_text'].force_encoding("UTF-8").encode("ISO-8859-1", invalid: :replace, undef: :replace, replace: '').force_encoding("UTF-8").scrub # rescue puts "UTF-8 ERROR!" | |
| h['thread_text'] += links | |
| # extract URL domains, then group domains by whether they appear on the internal whitelist | |
| domains = h['thread_text'].scan(/https?:\/\/([\w\.-]+)/).flatten | |
| verdict = domains.group_by {|d| POPULAR_DOMAINS.include?(d)} | |
| # number of links to unrecognized domains that were posted | |
| if verdict[false].to_a.length > 0 | |
| h['spam_score'] = h['spam_score'].to_i + 5 | |
| h['flags'] = h['flags'].to_a + ['linking to an unrecognized domain'] | |
| end | |
| # check and see if spammy phrases/words are used in the post text, or if there's any content at all | |
| if h['thread_text'].scan(/(fake ?passport|lolita|viagra|cialis)/i).length > 0 | |
| h['spam_score'] = h['spam_score'].to_i + 5 | |
| h['flags'] = h['flags'].to_a + ['spam phrase in text'] | |
| elsif h['thread_text'][/\w/].nil? # no letters or numbers in the entire post | |
| h['spam_score'] = h['spam_score'].to_i + 5 | |
| h['flags'] = h['flags'].to_a + ['content-free post'] | |
| end | |
| # check for strange character sets in the title and body | |
| %w(Cyrillic Han Thai Arabic Tagalog).each do |charset| | |
| if h['title'][/\p{#{charset}}/] | |
| h['spam_score'] = h['spam_score'].to_i + 3 | |
| h['flags'] = h['flags'].to_a + ["#{charset} characters in title"] | |
| if h['thread_text'].scan(/\p{#{charset}}/i).length > h['thread_text'].scan(/[a-z]/i).length | |
| h['spam_score'] += 4 | |
| h['flags'] = h['flags'].to_a + ["#{charset} characters in the body"] | |
| end | |
| break | |
| end | |
| end | |
| # checks to see if user is creating a poll for first post, or is submitting a link for first post | |
| posts = $botkilla.page.xpath("//div[@class='smalltxt']").text[/(?<=Posts: )\d+/].to_i | |
| if posts < 1 | |
| h['spam_score'] = h['spam_score'].to_i + 6 | |
| h['flags'] = h['flags'].to_a + ['poll as first post'] | |
| elsif posts == 1 | |
| h['spam_score'] = h['spam_score'].to_i + 3 | |
| h['flags'] = h['flags'].to_a + ['first post ever'] | |
| end | |
| return h | |
| end | |
| # run this every time you want to check for spam, on a continuous loop to check continuously | |
| def self.kill_spam | |
| # refresh list of new users and list of new posts | |
| BK.update_users | |
| ar = BK.new_posts | |
| # cycle through new posts, analyzing each one for markers of spam | |
| ar.each do |h| | |
| next if $checked_threads.include?(h['tid']) | |
| # calculate a bunch of spam-related stats for a thread | |
| h = BK.investigate_thread(h) | |
| # delete if cumulative spam score is 10 or more | |
| if h['spam_score'].to_i >= 10 && h['tid'] > $tid_cutoff | |
| $last_error = "" | |
| puts h.to_yaml | |
| puts "Spam score is #{h['spam_score']}. Deleting it..." | |
| #x = gets.chop | |
| BK.delete_thread(h) #if x.downcase == "y" | |
| else | |
| $checked_threads << h['tid'] | |
| end | |
| end | |
| end | |
| end |