In [23]:
import os
import string
import nltk
import numpy as np
import pandas as pd
from PIL import Image
from pytesseract import image_to_string
from skimage.transform import rescale
from pycorenlp import StanfordCoreNLP
from collections import Counter
from nltk.corpus import stopwords



def getFileNames(tabloidDir, misleadingDir, goodDir):
    filenames = []

    for filename in os.listdir(tabloidDir):
        filename = str(filename)
        filenames.append("Tabloid Advertorial/"+filename)
        
    for filename in os.listdir(misleadingDir):
        filename = str(filename)
        filenames.append("Misleading Claims/" + filename)
        
    for filename in os.listdir(goodDir):
        filename = str(filename)
        filenames.append("Good Ads/" + filename)
        
    return filenames

In [24]:
filenames = getFileNames("Tabloid Advertorial", "Misleading Claims", "Good Ads")

In [25]:
def getText(tabloidDir, misleadingDir, goodDir):
    texts = []
    
    for filename in os.listdir(tabloidDir):
        texts.append(readText("Tabloid Advertorial/"+filename))
    for filename in os.listdir(misleadingDir):
        texts.append(readText("Misleading Claims/"+filename))
    for filename in os.listdir(goodDir):
        texts.append(readText("Good Ads/"+filename))
        
    return texts

def readText(filename):
    image = np.asarray(Image.open(filename))
    image = rescale(image, 3, mode = "reflect")
    image *= 255
    image=Image.fromarray(image.astype('uint8'))
    
    clean = image_to_string(image).replace("\n", ". ")
    clean= ''.join([x for x in clean if (x in string.ascii_letters + string.digits + " " + ".") ])
    return clean

In [26]:
texts = getText("Tabloid Advertorial", "Misleading Claims", "Good Ads")

In [27]:
texts = [msg.lower() for msg in texts]

In [29]:
# Must first run Stanford CoreNLP server from the command line using:
# "java -mx5g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 10000"
# from the CoreNLP directory

nlp = StanfordCoreNLP('http://localhost:9000')

def getSentiments(texts):
    sentiments = []
    for adMessage in texts:
        res = nlp.annotate(adMessage,
                           properties={
                               'annotators': 'sentiment', 
                               'outputFormat': 'json',
                               'timeout': 1000,
                           })

        overallSentiment = 0
        try: 
            for s in res["sentences"]:
                if (s["sentiment"] == "Negative"):
                    overallSentiment -= int(float(s["sentimentValue"]))
                elif (s["sentiment"] == "Positive"):
                    overallSentiment += int(float(s["sentimentValue"]))
            sentiments.append(overallSentiment)
        except:
            sentiments.append(0)
    return sentiments

In [30]:
sentiments = getSentiments(texts)

In [32]:
d = {'file': filenames, 'sentiments': sentiments, 'text':texts}
df = pd.DataFrame(data=d)
df['pos sent'] = [1 if val >= 0 else 0 for val in df['sentiments']]
df['is tabloid'] = [1 if file.startswith("Tabloid") else 0 for file in df['file']]
df['is misleading'] = [1 if file.startswith("Misleading") else 0 for file in df['file']]
df['is bad'] = [1 if file.startswith("Tabloid") or file.startswith("Misleading") else 0 for file in df['file']]
df['type'] = [file[:file.index(" ")] for file in df['file']]

In [34]:
def getBigrams(category):
    badtexts = df.loc[df[category] == 1]["text"]
    words = nltk.word_tokenize(" ".join(badtexts.tolist()))
    cleanerWords = [i for i in words if i.isalpha() and len(i) > 2]
    bigrams = nltk.bigrams(cleanerWords)
#    print(Counter(bigrams).most_common()) #e.g. read more, more sponsored, simple trick, discover how, stay young, hollywoods elite
    return Counter(bigrams)

def getKeywords(category):
    stop = stopwords.words('english')
    badtexts = df.loc[df[category] == 1]["text"]
    words = nltk.word_tokenize(" ".join(badtexts.tolist()))
    cleanerWords = [i for i in words if i.isalpha() and len(i) > 2 and i not in stop]
#    print(Counter(cleanerWords).most_common()) #e.g. read, sponsored, reveals, trick, seconds
    return Counter(cleanerWords)

In [35]:
def language_score(language, texts):
    filterWords = ["nonsecure", "view", "chrome", "file", "edit", "help", "bookmarks", "tab", "creatives", "inventory", "preview", "window"]
    scores = []
    for msg in texts:
        score = 0
        for word in language:
            wordFreq = language[word]
            if type(word) == tuple:
                word = ' '.join(word)
            if word in filterWords or word[:word.find(" ")] in filterWords:
                continue
            if msg.find(word) != -1:
                score += wordFreq
        scores.append(score)
    return scores

In [36]:
df["word score"] = language_score(getKeywords("is bad"), texts)
df["bigram score"] = language_score(getBigrams("is bad"), texts)
df["tabloid word score"] = language_score(getKeywords("is tabloid"), texts)
df["tabloid bigram score"] = language_score(getBigrams("is tabloid"), texts)
df["misleading word score"] = language_score(getKeywords("is misleading"), texts)
df["misleading bigram score"] = language_score(getBigrams("is misleading"), texts)
df

Unnamed: 0,file,sentiments,text,pos sent,is tabloid,is misleading,is bad,type,word score,bigram score,tabloid word score,tabloid bigram score,misleading word score,misleading bigram score
0,Tabloid Advertorial/1.JPG,12,creative id 86130765. . this woman treat her. ...,1,1,0,1,Tabloid,315,115,124,44,191,71
1,Tabloid Advertorial/10.JPG,6,creative id 87750553. . 9 regrow hair fast. . ...,1,1,0,1,Tabloid,251,56,97,26,154,30
2,Tabloid Advertorial/11.JPG,2,creative id 87838399. quaccowp. . pioneeting l...,1,1,0,1,Tabloid,151,19,63,18,88,1
3,Tabloid Advertorial/12.JPG,6,creative id 87994982. . . ifyoure born before ...,1,1,0,1,Tabloid,200,40,62,11,138,29
4,Tabloid Advertorial/13.JPG,1,creative id 88160299. . odd trick. to fix. sag...,1,1,0,1,Tabloid,134,11,48,4,86,7
5,Tabloid Advertorial/14.JPG,3,creative id 88233775. . i . g. 1. . the dirty ...,1,1,0,1,Tabloid,109,46,56,25,53,21
6,Tabloid Advertorial/15.JPG,2,creative id 88487685. . americans cannot stop....,1,1,0,1,Tabloid,307,85,139,39,168,46
7,Tabloid Advertorial/2.JPG,6,creative id 86141551. . 5 myths about milk. . ...,1,1,0,1,Tabloid,266,88,122,43,144,45
8,Tabloid Advertorial/3.JPG,5,creative id 86157903. . stem cell. breakthroug...,1,1,0,1,Tabloid,199,7,92,7,107,0
9,Tabloid Advertorial/4.JPG,2,creative id 86158228. . this retirement. looph...,1,1,0,1,Tabloid,340,103,139,43,201,60


In [37]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# df = df[df.index<245]
# X = df[['pos sent', 'word score', 'bigram score']]
# X = df[['pos sent']]
X = df[['pos sent', 'tabloid word score', 'tabloid bigram score', 'misleading word score', 'misleading bigram score']]
#Y = df[['is bad']]
Y = df[['type']]

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5)

clf = DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)
clf.predict(X_test)
clf.score(X_test, Y_test)

0.845

In [45]:
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
kfold = model_selection.KFold(n_splits=50, random_state=17)
model = RandomForestClassifier(n_estimators=10)
results = model_selection.cross_val_score(model, X, Y.values.ravel(), cv=kfold)
print(results.mean())

0.84


In [46]:
#####THIS IS THE FINAL MODEL#####

from sklearn import svm

model = svm.SVC(kernel='linear', C=1, gamma=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y.values.ravel(), test_size=0.25)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)


0.9