In [1]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.util import minibatch
import pandas as pd
import random
import numpy as np

In [2]:
class spacy_preprocessing_tools():
    
    def __init__(self, language = "en_core_web_sm"):
        
        # english language model
        self.nlp = spacy.load(language)
        
        # phrase matcher
        self.matcher = PhraseMatcher(self.nlp.vocab, attr='LOWER')
        
        # patterns to match
        self.patterns = []
        
         
    def addKeywords(self, keywords): 
        
        # words / group of words to match
        self.patterns += [self.nlp(text) for text in keywords]
        self.matcher.add("TerminologyList", None, *self.patterns)
    
    
    def returnKeywordMatches(self, text):
        
        # find keywords in the text
        text = self.nlp(text)
        matches = self.matcher(text)
        words = []
        
        # append keywords to a string separated by a comma
        for match in matches: 
            match_id, start, end = match
            words.append(str(text[start:end]))
    
        if len(words):       
            return words
            
        return None
        
        
    def addStopwords(self, custom_stopwords):
        
        for words in custom_stopwords:
            self.nlp.vocab[w].is_stop = True
   

    def removeStopwords(self, text):
        text = self.nlp(text)
        return ' '.join([token.text for token in text if not token.is_stop])
    
    
    def lemmatize(self, text):
        text = self.nlp(text)
        return ' '.join([token.lemma_ for token in text])
    
    
    def word2vecEmbeddings(self, text_data):
        
        # large model to get the vectors
        #self.nlp = spacy.load("en_core_web_lg")
        
        # Disable other pipes and determine embedding vector for each text
        with self.nlp.disable_pipes():
            doc_vectors = np.array([self.nlp(text).vector for text in text_data])
            
        print(doc_vectors.shape)
        
        return doc_vectors

In [3]:
class BoWModel():
    
    def __init__(self, language = 'en'):
        '''
        Initialize bag of words model
        '''
        # Create an empty model
        self.nlp = spacy.blank(language)
        
        # Add TextCategorizer with "bow" architecture to model
        self.textcat = self.nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "bow"})
        self.nlp.add_pipe(self.textcat)
        
    def addLabels(self, labels):
        '''
        Add labels to model
        '''
        for label in labels:
            self.textcat.add_label(label)
    
    def fit(self, text_col, label_col):
        '''
        Train the model. Only supports 2 classes at the moment.
        '''        
        X = text_col.values           
        y = [{'cats': {self.textcat.labels[0]: label == self.textcat.labels[0], 
                       self.textcat.labels[1]: label == self.textcat.labels[1]}}
             for label in label_col]
        
        train_data = list(zip(X, y))
        
        random.seed(1)
        spacy.util.fix_random_seed(1)
        optimizer = self.nlp.begin_training()

        losses = {}
        for epoch in range(10):
            random.shuffle(train_data)
            
            # Create the batch generator with batch size = 8
            batches = minibatch(train_data, size=8)
            
            # Iterate through minibatches
            for batch in batches:
                # Each batch is a list of (text, label) but we need to
                # send separate lists for texts and labels to update().
                # This is a quick way to split a list of tuples into lists
                texts, labels = zip(*batch)
                self.nlp.update(texts, labels, sgd=optimizer, losses=losses)
            print(losses)
            
    def score(self):
        pass
    
    def predict(self, texts):
        docs = [self.nlp.tokenizer(text) for text in texts]
    
        # Use textcat to get the scores for each doc
        self.textcat = self.nlp.get_pipe('textcat')
        scores, _ = self.textcat.predict(docs)
        
        # From the scores, find the label with the highest score/probability
        predicted_labels = scores.argmax(axis=1)
        
        return [self.textcat.labels[label] for label in predicted_labels]

In [4]:
news_df = pd.read_csv('data/fake-and-real-news-dataset/combined.csv')

In [5]:
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,WATCH: Six Minutes Of Conservative Media’s Se...,It s no secret that conservatives and Republic...,News,"August 2, 2016",fake
1,Sanders: Firms must take 'haircut' in Puerto R...,WASHINGTON (Reuters) - Wall Street investment ...,politicsNews,"April 1, 2016",real
2,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"November 29, 2016",real
3,CNBC EDITOR: Media Must Remember Readers Are N...,A CNBC editor said members of the press need t...,left-news,"Jun 29, 2017",fake
4,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,Remember when these Turkish thugs beat up (see...,politics,"Sep 22, 2017",fake


In [6]:
nlp_process = spacy_preprocessing_tools()

In [7]:
news_df["full_text"] = news_df["title"] + ": " + news_df["text"] 
news_df.head()

Unnamed: 0,title,text,subject,date,label,full_text
0,WATCH: Six Minutes Of Conservative Media’s Se...,It s no secret that conservatives and Republic...,News,"August 2, 2016",fake,WATCH: Six Minutes Of Conservative Media’s Se...
1,Sanders: Firms must take 'haircut' in Puerto R...,WASHINGTON (Reuters) - Wall Street investment ...,politicsNews,"April 1, 2016",real,Sanders: Firms must take 'haircut' in Puerto R...
2,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"November 29, 2016",real,Factbox: Trump fills top jobs for his administ...
3,CNBC EDITOR: Media Must Remember Readers Are N...,A CNBC editor said members of the press need t...,left-news,"Jun 29, 2017",fake,CNBC EDITOR: Media Must Remember Readers Are N...
4,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,Remember when these Turkish thugs beat up (see...,politics,"Sep 22, 2017",fake,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...


In [8]:
news_df["full_text"][0]

' WATCH: Six Minutes Of Conservative Media’s Sexist Assault On Hillary Clinton: It s no secret that conservatives and Republicans (not all) despise President Obama simply because he is black. After years of insisting he s an  Arab,  from Kenya, a Muslim, or anything else you can name, it s telling that some Americans hold him to a completely different standard simply because he s black.Now take Hillary Clinton, who is on the cusp of making history as the first female President of the United States, just as Barack Obama did as the first African American president. For years, Clinton has been in the public eye as a governor s wife, a First Lady, a senator, a Secretary of State, and now a presidential candidate. But like their hate for President Obama simply because he is black, there are those on the right who hate her simply because she is a woman. Like the dog whistle politics that follow President Obama, conservatives have unleashed their own dog whistle on Clinton   and they are even

In [9]:
bow = BoWModel()
bow.addLabels(news_df.label.unique().tolist())

In [10]:
test = news_df.sample(frac = 0.025).copy()
len(test)

1122

In [11]:
test["full_text"] = test["full_text"].apply(lambda x: nlp_process.removeStopwords(x))
test["full_text"] = test["full_text"].apply(lambda x: nlp_process.lemmatize(x))

In [12]:
doc_vectors = nlp_process.word2vecEmbeddings(test.full_text)

(1122, 96)


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(doc_vectors, test.label,
                                                    test_size=0.2, random_state=1)

In [51]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf']}
grid = GridSearchCV(SVC(),param_grid,refit=True)
grid.fit(X_train,y_train)

print(grid.best_estimator_)



SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [53]:
svc = SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

svc.fit(X_train, y_train)
print(f"Train Accuracy: {svc.score(X_train, y_train) * 100:.3f}%", )
print(f"Test Accuracy: {svc.score(X_test, y_test) * 100:.3f}%", )

Train Accuracy: 95.206%
Test Accuracy: 92.000%


Optimization of the C regularization parameter to reduce overfitting 

In [47]:
from sklearn.svm import SVC

svc = SVC(C=1.65, kernel='rbf', gamma='scale', probability=False, tol=0.001, 
                 cache_size=200, class_weight=None, verbose=False, max_iter=-1, 
                 decision_function_shape='ovr', random_state=None)

svc.fit(X_train, y_train)
print(f"Train Accuracy: {svc.score(X_train, y_train) * 100:.3f}%", )
print(f"Test Accuracy: {svc.score(X_test, y_test) * 100:.3f}%", )

Train Accuracy: 93.200%
Test Accuracy: 93.333%
