# Preprocessing

In [1]:
import numpy as np
import spacy
import gensim
import os
import re
from gensim.utils import simple_preprocess, lemmatize
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

from gensim.models.ldamulticore import LdaMulticore

import pandas as pd
nlp = spacy.load("en_core_web_lg")



In [2]:
from bs4 import BeautifulSoup
from os import mkdir
import requests
#Dictionary mapping a short book key to the URL of the book's full text in HTML format
urls = dict([
    ('foucault_madness_and_civ',
     'https://archive.org/stream/Michel_Foucault_Madness_And_Civilization/Michel%20Foucault%2C%20Richard%20Howard%20%28transl.%29%20-%20Madness%20and%20Civilization_%20A%20History%20of%20Insanity%20in%20the%20Age%20of%20Reason%20%282013%2C%20Vintage%29_djvu.txt'),
    ('foucault_history_of_sexuality',
     'https://archive.org/stream/TheHistoryOfSexualityVol13/The-History-Of-Sexuality-Vol-1-3_djvu.txt'),
    ('chomsky_american_power',
     'https://archive.org/stream/AmericanPowerAndTheNewMandarins_201805/American%20Power%20And%20The%20New%20Mandarins_djvu.txt'),
    ('chomsky_manufacturing_consent',
     'https://archive.org/stream/revhosatx14/%5BEdward_S._Herman%2C_Noam_Chomsky%5D_Manufacturing_Con%28b-ok.org%29_djvu.txt'),
])

In [3]:
def file_books(title, link):
    '''Download one book and prepare its files under ./data.

    Fetches the page at ``link``, takes the text of its first ``<pre>``
    element, writes that text to ``./data/{title}.txt`` and makes sure the
    folder ``./data/{title}_extracts`` exists.

    Parameters
    ----------
    title : str
        Key used for the text file name and the extracts folder name.
    link : str
        URL of the HTML page holding the book text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``<pre>`` element.
    '''
    #Access HTML webpage on Internet Archive; stop on HTTP errors instead of
    #saving an error page as if it were the book
    r = requests.get(link, timeout=60)
    r.raise_for_status()
    #Name the parser explicitly so the parse does not depend on which
    #optional parsers happen to be installed
    soup = BeautifulSoup(r.text, 'html.parser')
    #Full text is in <pre> part of HTML doc
    pre = soup.pre
    if pre is None:
        raise ValueError(f'No <pre> element found at {link}')
    #get_text() still returns the text when <pre> has nested tags,
    #a case in which .string would be None
    book = pre.get_text()
    #Make a folder for each book; this also creates ./data when missing and
    #does not fail when the folder is already there
    os.makedirs(f'./data/{title}_extracts', exist_ok=True)
    #Write book as text file, save file
    with open(f'./data/{title}.txt', 'w', encoding='utf-8') as book_file:
        book_file.write(book)
def split_book(title, n_lines=5):
    '''Split a book's text file into consecutive extracts of ``n_lines`` lines.

    Reads ``./data/{title}.txt`` and writes the extracts to
    ``./data/{title}_extracts/{title}_{k}.txt`` with k = 0, 1, 2, ...
    Every extract holds ``n_lines`` lines, except possibly the last one.
    An empty book produces no extract file.

    Parameters
    ----------
    title : str
        Book key; selects the input file and the output folder.
    n_lines : int, default 5
        Number of lines per extract; must be at least 1.

    Raises
    ------
    ValueError
        If ``n_lines`` is smaller than 1.
    '''
    if n_lines < 1:
        raise ValueError('n_lines must be a positive integer')
    #Find file path based on title
    filepath = f'./data/{title}.txt'
    #Extract directory and filename from file path
    path, filename = os.path.split(filepath)
    #Change path to book's directory
    path = os.path.join(path, f'{title}_extracts')
    # filename.split('.') would not work for filenames with more than one .
    basename, ext = os.path.splitext(filename)
    #The output folder may not exist yet when this is called on its own
    os.makedirs(path, exist_ok=True)
    #No extract is opened before the first line is read, so no empty
    #placeholder file is ever left behind
    extract_file = None
    #open input file
    with open(filepath, 'r', encoding='utf-8') as book_file:
        try:
            #Loop over all lines of input file, number them
            for i, line in enumerate(book_file):
                #Every n_lines lines: close the current extract, start the next
                if i % n_lines == 0:
                    if extract_file is not None:
                        extract_file.close()
                    #Integer extract number keeps the file names free of floats
                    extract_name = '{}_{}{}'.format(basename, i // n_lines, ext)
                    extract_file = open(os.path.join(path, extract_name), 'w', encoding='utf-8')
                #write the line to extract file
                extract_file.write(line)
        finally:
            #close last output file, if any was opened
            if extract_file is not None:
                extract_file.close()

In [4]:
#One-time bootstrap: when ./data is absent, create it, then download and
#split every book listed in `urls`. An existing ./data skips all of this.
if not os.path.isdir('./data'):
    os.mkdir('./data')
    for title in urls:
        link = urls[title]
        file_books(title, link)
        split_book(title)

In [5]:
def tokenize(doc):
    '''Return the lemmas of the tokens in ``doc`` that pass the filters.

    The text is run through the module-level ``nlp`` pipeline. A token is
    dropped when it is a stop word, punctuation, whitespace, all upper
    case, or tagged as a proper noun (``PROPN``); the lemma of every
    remaining token is returned, in document order.
    '''
    lemmas = []
    for token in nlp(doc):
        #Any one of these flags is enough to discard the token
        if token.is_stop or token.is_punct or token.is_space or token.is_upper:
            continue
        if token.pos_ == 'PROPN':
            continue
        lemmas.append(token.lemma_)
    return lemmas
def gather_data(path_to_data):
    '''Load every ``.txt`` file of one extracts folder together with its author.

    Parameters
    ----------
    path_to_data : str
        Folder name inside ``./data``. The text before its first underscore
        is used as the author label.

    Returns
    -------
    dict
        ``{'extracts': [...], 'author': [...]}`` with one entry per text
        file; newlines in each extract are replaced by spaces. Files are
        taken in sorted name order.
    '''
    path = f'./data/{path_to_data}'
    #Author label is the part of the folder name before the first underscore
    author = path_to_data.split('_')[0]
    data = {'extracts': [], 'author': []}
    #Sorted because os.listdir gives no ordering guarantee
    for file in sorted(os.listdir(path)):
        full_path = os.path.join(path, file)
        #Test the entry inside `path`; a bare name would be resolved against
        #the current working directory and never match
        if os.path.isdir(full_path):
            continue
        #If the file type is .txt
        if not file.endswith('.txt'):
            continue
        #Open each text file at the path provided
        with open(full_path, encoding='utf-8') as t:
            #Read and strip new line signal
            data['extracts'].append(t.read().replace('\n', ' '))
            data['author'].append(author)
    return data

In [6]:
tokenize('If you should ever leave me, Jack your life would still go on believe me')      

['leave', 'life', 'believe']

In [7]:
#Names of the entries in ./data that are folders (files such as the full book texts are left out)
extracts_dirs = [entry for entry in os.listdir('./data') if os.path.isdir(f'./data/{entry}')]

In [8]:
#Collect the extracts of every folder into one pair of columns and build the
#frame once, rather than concatenating a new frame per folder
records = {'extracts': [], 'author': []}
for directory in extracts_dirs:
    extracts = gather_data(directory)
    records['extracts'].extend(extracts['extracts'])
    records['author'].extend(extracts['author'])
#A single construction gives the frame a unique 0..n-1 index; per-folder
#concatenation repeated the labels 0, 1, 2, ... for every folder
df_final = pd.DataFrame(records)
print(df_final.shape)
df_final.head()

(17839, 2)


Unnamed: 0,extracts,author
0,:| r?|| f • >0* rl; 1=/!= If j,chomsky
1,' a?KJii5K;jp«a;KWSg;«K,chomsky
2,Ffc =£; £r fffjil rI^|l£=£|/F~*f HScIf£||| ...,chomsky
3,ilfiiiliilililllliliilliiliilil: 5ffi||||^p|...,chomsky
4,. liii/sf;,chomsky


# Model

## Train/Test Split

In [9]:
from sklearn.model_selection import train_test_split
#Features are the raw extract texts, the target is the author label
X = df_final['extracts']
y = df_final['author']

#Hold out 10% of the extracts as the test set; the fixed random_state makes
#the shuffled split identical on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=42)

In [10]:
#Carve a validation set (20% of the remaining training data) out of the
#training split; the fixed random_state keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=42)

## Randomized Search w/ Random Forest

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
#TF-IDF features computed with the notebook's own `tokenize` function.
#max_df, min_df and ngram_range keep their defaults here; candidate values
#for them are listed in the randomized-search parameters.
counter = TfidfVectorizer(tokenizer=tokenize)

#Fixed seed so that repeated runs grow the same forest
rf = RandomForestClassifier(random_state=42)

In [12]:
#Vectorizer followed by the classifier; the step names are the prefixes of
#the parameter keys below
pipeline = Pipeline([('counter', counter),
                     ('rf', rf)])
#Candidate values sampled by the randomized search
parameters = {
    'counter__max_df': [i/100 for i in range(75, 100)],
    #An integer min_df is a document count, so the candidates start at 1
    #(0 would filter nothing more than 1 does)
    'counter__min_df': range(1, 10),
    'counter__ngram_range': [(1,2), (1,1), (1, 3), (2,3)],
    'rf__max_depth': range(5,20),
    'rf__min_samples_split': range(2, 10),
    'rf__min_samples_leaf': range(1,50)
}

#10 sampled candidates x 3 folds; random_state fixes which candidates are drawn
rand_search = RandomizedSearchCV(pipeline, parameters, cv=3, n_iter=10, n_jobs=1, verbose=2, random_state=42)

In [None]:
#Run the randomized search on the training split. This is slow: the logged
#fits below took roughly 3 to 3.6 minutes each, and 30 fits are scheduled.
best = rand_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] rf__min_samples_split=8, rf__min_samples_leaf=27, rf__max_depth=17, counter__ngram_range=(1, 2), counter__min_df=2, counter__max_df=0.88 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  rf__min_samples_split=8, rf__min_samples_leaf=27, rf__max_depth=17, counter__ngram_range=(1, 2), counter__min_df=2, counter__max_df=0.88, total= 3.5min
[CV] rf__min_samples_split=8, rf__min_samples_leaf=27, rf__max_depth=17, counter__ngram_range=(1, 2), counter__min_df=2, counter__max_df=0.88 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min remaining:    0.0s


[CV]  rf__min_samples_split=8, rf__min_samples_leaf=27, rf__max_depth=17, counter__ngram_range=(1, 2), counter__min_df=2, counter__max_df=0.88, total= 3.1min
[CV] rf__min_samples_split=8, rf__min_samples_leaf=27, rf__max_depth=17, counter__ngram_range=(1, 2), counter__min_df=2, counter__max_df=0.88 




[CV]  rf__min_samples_split=8, rf__min_samples_leaf=27, rf__max_depth=17, counter__ngram_range=(1, 2), counter__min_df=2, counter__max_df=0.88, total= 3.6min
[CV] rf__min_samples_split=8, rf__min_samples_leaf=11, rf__max_depth=7, counter__ngram_range=(1, 3), counter__min_df=3, counter__max_df=0.98 




In [None]:
#Predicted author label for every validation extract, using the best pipeline found by the search
best.best_estimator_.predict(X_val)

In [None]:
#Score of the best pipeline on the validation split
#NOTE(review): for a classifier pipeline this is presumably mean accuracy - confirm
best.best_estimator_.score(X_val, y_val)

In [None]:
from sklearn.metrics import roc_auc_score
#Predicted labels for the validation split
#NOTE(review): neither y_pred nor roc_auc_score is used in the cells visible here
y_pred = best.best_estimator_.predict(X_val)

In [None]:
#Per-class probabilities for every validation extract
best.best_estimator_.predict_proba(X_val)

In [None]:
#Probe: class probabilities for a document made of the single word 'chomsky'
best.best_estimator_.predict_proba(['chomsky'])

In [None]:
best.best_estimator_.predict(["""was therefore suspect from 
the start, and an “alternative model 55 of inducement-pressure coaching 
was plausible and relevant, from the Agca 5 s first implication of Bulgari¬ 
ans. This model became more cogent over time as Agca retracted 
strategic claims, and as no confirming evidence of a Bulgarian Connec¬ 
tion was produced. By the same token, the SHK model, implausible 
from the beginning, became even less tenable. 


4.4. THE MASS MEDIA’S 
UNCRITICAL ACCEPTANCE OF 
THE BULGARIAN CONNECTION 


Despite the implausibility of the SHK claim that Agca had been hired 
by the Bulgarians and the KGB to shoot the pope, and although it was 



THE KGB-BULG ARIAN PLOT TO KILL THE POPE I55 


sustained by argument that amounted to sheer humbuggery, the Bul¬ 
garian Connection met the standard of utility. In this case, therefore, 
as a propaganda model would anticipate, the U.S. mass media accepted 
the SHK model as valid, ignored the alternative model, and par¬ 
ticipated in a classic propaganda campaign that got the message of 
Bulgarian-Soviet guilt over to the public. Some members of the mass 
media helped originate the claim of a Bulgarian Connection, while 
others participated only in disseminating the SHK line (and excluding 
alternative views and inconvenient information). 

The campaign began with Sterling’s Reader’s Digest article of Sep¬ 
tember 1982, which was closely followed by the NBC-TV program of 
September 21, 1982, The outreach of these two statements asserting a 
Bulgarian Connection was great, and they were widely reported upon 
in the rest of the media in the form of a summary of their claims, with 
virtually no questions raised about their validity. With Agca’s Novem¬ 
ber 1982 naming of Bulgarians, the mass media began to report the 
Bulgarian Connection intensively. This reporting was carried out ex¬ 
clusively within the frame of the SHK model, and for most of the mass 
media no serious departures from this model occurred through the 
conclusion of the Rome trial in March 1986. 24 

Agca’s naming of the Bulgarians was the key fact that generated news 
coverage, providing the basis for reiterated details about the Bulgarians, 
explanations of the Bulgarian (and Soviet) motive, and speculation 
about the political implications of the charges, if confirmed. A major 
characteristic of these news reports was their sheer superficiality, with 
the charges never seriously examined but merely regurgitated and 
elaborated with odd facts and opinion, and with no departures from the 
SHK frame (and no hints of the possible relevance of an alternative 
frame). The charges constituted a form of vindication of the SHK 
model if taken at face value and presented superficially—i.e., if the 
media presentations never considered political convenience, prison 
conditions, possible deals, plausible deniability, etc* And this proce¬ 
dure—a reiteration of Agca claims, supplemented by extremely super¬ 
ficial pro-plot speculation—was the principal modality by which the 
mass media accepted and pushed the propaganda line, 

Newsweek provides a prototype of news coverage within the SHK 
framework in its article of January 3,1983, “The Plot to Kill Pope John 
Paul II.” The Bulgarian-Soviet motive as portrayed by SHK is reite¬ 
rated through quotes from congenial sources—“a precautionary and 
alternative solution to the invasion of Poland”—while nobody is quoted 
discussing costs and benefits, the nature of the Soviet leadership, or 
Western benefits from Agca’s confession. 25 In fact, Newsweek suggests 



156 MANUFACTURING CONSENT 

"""])

In [None]:
#Number of extracts per author label (shows the class balance of the data set)
df_final['author'].value_counts()