# Supervised machine learning with text
### The case of having access to a large newspaper dataset with annotated topics

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
import joblib

I started out with having a folder with all the newspaper articles (in .txt files) that were annotated. Additionally, I had a file called np_workfile.csv that included a lot of infromation about the articles and -- most importantly -- the topic annotations. Starting from this, I first had to process the text in all the article files and get it together with the annotation file. This is the code I used (with explanations on the side): 

In [None]:
from sys import maxunicode #library to get everything that might be punctuation
import unicodedata #library to get everything that might be punctuation
from nltk.corpus import stopwords #stopword list
from nltk.tokenize import word_tokenize #tokenizer
from nltk.stem.snowball import SnowballStemmer #stemmer
import nltk #also including the overall model since later on other submodules are addressed

np_data = pd.read_csv('/home/felicia/News_classifier/np_workfile.csv') #reading the file with the annotations
tbl = dict.fromkeys(i for i in range(maxunicode) if unicodedata.category(chr(i)).startswith('P')) #list of all punctuation
stopwords_list = set(stopwords.words('dutch')) #list of all Dutch stopwords
stemmer=SnowballStemmer('dutch') #initializing Dutch stemmer

#since the name of the full text to the newspaper articles was a column in the CSV file
#I had to write a function that points to the folder where I saved the articles and adds the filename to it
#
def substitute_location(text):
    text_new = '/home/felicia/News_classifier/articles/' + text 
    return text_new

np_data['Filename']=np_data['V3'].apply(substitute_location) #Making a new column called filename with the full path

'''
Defining a function that: 
- opens every file
- reads it and splits it, filters empty lines out
- Since those are LexisNexis files, I made two general rules to find the beginning and end of the main text
- retrieving the main text between beginning and end, joining together the separate lines to have one string and not a list
- removing weird characters and punctuation
- tokenizing the text
- stemming and lowercasing the text
- making bigrams
- joining unigrams and bigrams to one processed text
'''
def process_text(filename):
    with open(filename) as fi:
        text = fi.read().splitlines()
        text = list(filter(None,text))
        beginning = [i for i, s in enumerate(text) if 'LENGTH' in s or 'DATELINE' in s or 'LENGTE' in s]
        end = [i for i, s in enumerate(text) if 'LOAD-DATE' in s or 'LANGUAGE' in s]
        text = text[beginning[-1]+1:end[0]]
        text = ''.join(text)
        text = text.replace(u"`",u"").replace(u"´",u"").translate(tbl)
        text = word_tokenize(text)
        text = [stemmer.stem(w.lower()) for w in text if w.lower() not in stopwords_list and not (w.isalpha() and len(w)==1)]
        text_bigrams = ["_".join(tup) for tup in nltk.ngrams(text,2)]
        text_final = text + text_bigrams
        return text_final

#Applying the above function to the dataframe, makimg a new column with the processed text in it
#Saving the resulting file as pkl file
np_data['Processed_text']=np_data['Filename'].apply(process_text)
np_data.to_pickle("/home/felicia/News_classifier/classifier_data.pkl")


In [3]:
df = pd.read_pickle('./classifier_data.pkl')

Looking at the dataframe, the most important columns are "Processed_text" and v9_major_rec

In [4]:
df.head()

Unnamed: 0,Hits,Score,ScorePercent,Filename,V3,Size,WordCt,Title,headline,length,...,V42,V43,V44,filename2,v9_major,dubbel,in,Processed_text,topic,v9_major_rec
0,1,1,11,/home/felicia/News_classifier/articles/#100 @3...,#100 @328419 +2073,2073,318,1899 of 1983 DOCUMENTS NRC Handelsblad January...,"1899 of 1983 DOCUMENTS;;;; January 16, 1999;;;...",LENGTH: 285 words,...,,,,0,99,0,1,"[wereld, zondagmorg, antropolog, dr, mattijs, ...",Anders/ diversen,99
1,3,3,20,/home/felicia/News_classifier/articles/#10000 ...,#10000 @16311398 +11159,11159,1775,171 of 2386 DOCUMENTS De Telegraaf 6 december ...,171 of 2386 DOCUMENTS;;;; De Telegraaf;; 6 dec...,LENGTH: 1693 woorden,...,1.0,2.0,2.0,0,16,0,1,"[samenvattingd, speurtocht, efraim, zuroff, na...",Defensie,16
2,3,3,20,/home/felicia/News_classifier/articles/#10004 ...,#10004 @16327565 +1346,1346,185,175 of 2386 DOCUMENTS De Telegraaf 5 december ...,175 of 2386 DOCUMENTS;;;;;;;;;;;;,LENGTH: 141 woorden,...,,,,0,16,0,1,"[vol, verwacht, klopp, hartjes, onz, stoer, ma...",Defensie,16
3,1,1,12,/home/felicia/News_classifier/articles/#10004 ...,#10004 @34719038 +2546,2546,368,"1142 of 1781 DOCUMENTS de Volkskrant May 1, 20...","1142 of 1781 DOCUMENTS;;;; May 1, 2001;; Werkg...",LENGTH: 349 words,...,,,,0,5,0,1,"[nieuwkomer, bedrijf, denkt, flink, salaris, g...",Arbeid,5
4,1,1,11,/home/felicia/News_classifier/articles/#10005 ...,#10005 @37431676 +6256,6256,989,"1189 of 1778 DOCUMENTS NRC Handelsblad May 7, ...","1189 of 1778 DOCUMENTS;;;; May 7, 2001;; 'Tijd...",LENGTH: 955 words,...,,,,0,3,0,1,"[ziekenhuiz, hengelo, leeuward, vandag, gestaa...",Gezondheid,3


In [63]:
df = df[df.v9_major_rec != " "] #filtering out rows with empty topic variable

In [64]:
len(df) #total number of annotated articles

11124

In [65]:
#Since I was not interested in very finegrained, small topics but rather 9-10 larger ones 
#I recoded the topic variable by using a dictionary and my own function
recode = {'Binnenland':['13','14','20', '3', '4', '5', '6'], 'Buitenland':['16', '19', '2'], 'Economie':['1','15'], 'Milieu':['8', '7'],  'Wetenschap':['17'], 'Immigratie':['9'],  'Justitie':['12'], 'Sport':['29'], 'Entertainment':['23'], 'Anders':['10','99']}

In [66]:
def recode_topics(number):
    for key, value in recode.items():
        if number in value:
            result = key
    return result

In [67]:
df['topic'] = df['v9_major_rec'].apply(recode_topics)

In [68]:
df['topic'].value_counts() #amount of different topics in the dataframe

Binnenland       2500
Buitenland       1831
Anders           1670
Justitie         1201
Entertainment    1043
Economie         1036
Sport            1029
Wetenschap        427
Milieu            235
Immigratie        152
Name: topic, dtype: int64

In [69]:
y = df.topic #making a separate variable out of the topic column
df = df.drop('topic', axis = 1) #dropping the topic column from the dataset (since it cannot be part of the test dataset)

In [70]:
df['Processed_text']=[" ".join(text) for text in df['Processed_text'].values] #processed text was in a list, now I again need it as a string

In [71]:
X_train, X_test, y_train, y_test = train_test_split(df['Processed_text'], y, test_size=0.2) #doing a test-train split

In [72]:
tfidf_vectorizer = TfidfVectorizer() #initializing a vectorizer
tfidf_train = tfidf_vectorizer.fit_transform(X_train) #training the vectorizer
with open('vectorizer.pkl', 'wb') as fin: #saving the vectorizer (in case I need it later again and do not want to retrain it)
    joblib.dump(tfidf_vectorizer, fin)


In [73]:
tfidf_test = tfidf_vectorizer.transform(X_test) #transforming the test data with the same vectorizer 

In [74]:
linear_clf = PassiveAggressiveClassifier(n_iter=50) #initializing the classifier

In [75]:
linear_clf.fit(tfidf_train, y_train) #running the model



PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=None, n_iter=50, n_iter_no_change=5, n_jobs=None,
              random_state=None, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [76]:
pred = linear_clf.predict(tfidf_test) #predicting the scores
score = metrics.accuracy_score(y_test, pred) #getting metrics to evaluate
print("accuracy:   %0.3f" % score) #overall accuracy

accuracy:   0.691


In [77]:
#extra gimmick: making a classification report (only to make it look pretty and 
#get a nice overview, not necessary for the analysis)
a = classification_report(y_test, pred, target_names = ['Binnenland', 'Buitenland', 'Economie', 'Milieu', 'Wetenschap', 'Justitie', 'Immigratie', 'Sport', 'Entertainment', 'Anders'])

In [78]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils.multiclass import unique_labels
def classification_report_pandas(ground_truth,
                                            predictions):
    """
    Saves the classification report to csv using the pandas module.
    :param ground_truth: list: the true labels
    :param predictions: list: the predicted labels
    """
    labels = unique_labels(ground_truth, predictions)

    precision, recall, f_score, support = precision_recall_fscore_support(ground_truth,
                                                                          predictions,
                                                                          labels=labels,
                                                                          average=None)
    results_pd = pd.DataFrame({"topic": labels,
                               "f_score": f_score,
                               'precision': precision,
                               'recall':recall,
                               })
    return results_pd

In [79]:
df2 = classification_report_pandas(y_test, pred)

In [80]:
b = y.value_counts()

In [81]:
b = b.to_frame()

In [82]:
b['index'] = b.index

In [83]:
final = pd.merge(df2, b, left_on='topic', right_on='index')
final = final.drop('index', axis = 1)
final.rename(columns = {'topic_y':'number', 'topic_x':'topic'}, inplace = True)

In [84]:
final #printing the evaluation metrics for each topic and how often it appeared in the dataset

Unnamed: 0,topic,f_score,precision,recall,number
0,Anders,0.643857,0.68543,0.607038,1670
1,Binnenland,0.679962,0.640429,0.724696,2500
2,Buitenland,0.69697,0.663462,0.734043,1831
3,Economie,0.693976,0.727273,0.663594,1036
4,Entertainment,0.692708,0.675127,0.71123,1043
5,Immigratie,0.521739,0.75,0.4,152
6,Justitie,0.688742,0.696429,0.681223,1201
7,Milieu,0.412698,0.764706,0.282609,235
8,Sport,0.895787,0.893805,0.897778,1029
9,Wetenschap,0.493333,0.528571,0.4625,427


In [87]:
with open('topic_classifier.pkl', 'wb') as fid: #saving the classifier in a file (so I do not have to retrain it again)
    joblib.dump(linear_clf, fid)  