## DS-F5 Analysis of Big Data 
### German news article classification

Author: Tanay Tunçer




## Getting started

In [None]:
import os
from tqdm import tqdm 

#Data preprocessing
import numpy as np
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity
from python.text_preprocessing import extract_nouns, remove_stopwords, drop_rows, stemming_text, split_data, remove_stopwords_punct
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import RegexpTokenizer

#Data visualization
from python.data_visualization import bar_chart, histogram, get_top_n_gram, confusion_matrix_plot
import plotly_express as px

#Machine Learning
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB

#Quality metrics
from sklearn.metrics import classification_report, f1_score

#!python -m spacy download de_core_news_sm

In [None]:
# Directory that holds the raw CSV export of the news articles.
path = "data/input/raw/"

# os.listdir() returns entries in arbitrary order, so sort the matches to make
# the chosen file deterministic across machines and runs.
csv_files = sorted(file for file in os.listdir(path) if file.endswith("csv"))

# Fail here with a clear message instead of a NameError on news_df further down.
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in '{path}'.")

# As before, the last matching file ends up in news_df; the others are no
# longer parsed just to be thrown away.
file = csv_files[-1]
news_df = pd.read_csv(os.path.join(path, file))

In [None]:
# Overview of the raw frame: column dtypes, non-null counts and memory usage.
news_df.info()

In [None]:
# Peek at the first three raw articles.
news_df.head(3)

In [None]:
# Number of missing values per column.
news_df.isnull().sum()

In [None]:
# Count articles whose text repeats an earlier row (the first occurrence is not counted).
n_duplicates = news_df.duplicated(subset = "text").sum()
print(f"{n_duplicates} duplicates")


In [None]:
# Clean the raw frame with the project helper drop_rows, passing the "text"
# column and every column after the first one.
# NOTE(review): presumably this removes duplicate / incomplete rows - confirm
# in python/text_preprocessing. The result overwrites news_df.
news_df = drop_rows(news_df, "text", news_df.columns[1:])

In [None]:
# Inspect the frame again after drop_rows.
news_df.head()

## Data pre-processing

In [None]:
# Toggle for the (slow) text preprocessing:
#   True  -> rebuild the corpora from news_df in this cell.
#   False -> load the already pre-processed corpora from data/input/clean/.
start_data_preprocessing = False

if start_data_preprocessing:
    # Planned pipelines according to the original notes:
    #   1) corpus_s1: steps 1, 3, 2 and 4 (steps 3 and 4 are disabled below)
    #   2) corpus_s2: steps 1, 2, 5 and 6
    #   3) corpus_:   steps 1, 2 and 5
    #
    # Only the first 10 articles are processed (quick test run); switch to the
    # commented-out full copies below for a complete run.
    corpus_s1 = news_df[:10].copy()
    corpus_s2 = news_df[:10].copy()

    #corpus_s1 = news_df.copy()
    #corpus_s2 = news_df.copy()

    print("Data preprocessing is started ..")

    # Step 1: lower-case every article.
    corpus_s1["text"] = corpus_s1["text"].map(lambda x: x.lower())
    print("Step 1 is finished")

    # Step 2: split each article into word tokens. The tokenizer is created
    # once instead of once per article.
    word_tokenizer = RegexpTokenizer(r"\w+")
    corpus_s1["text"] = corpus_s1["text"].map(word_tokenizer.tokenize)
    print("Step 2 is finished")

    #corpus_s1["text"] = corpus_s1["text"].apply(lambda x: stemming_text(x))
    #print("Step 3 is finished")

    #corpus_s1["text"] = corpus_s1["text"].apply(lambda x: remove_stopwords_punct(x))
    #print("Step 4 is finished")

    # Step 5: stop-word removal on the lower-cased, tokenized text of corpus_s1.
    corpus_s2["text"] = corpus_s1["text"].map(remove_stopwords)
    print("Step 5 is finished")

    # corpus_ (steps 1, 2 and 5) equals corpus_s2 at this point, i.e. before the
    # noun column is added. This branch previously never defined corpus_, so the
    # later cell that plots the top words of corpus_ failed with a NameError.
    # NOTE(review): confirm this matches the layout of data/input/clean/corpus_.csv.
    corpus_ = corpus_s2.copy()

    # Step 6: noun extraction into a separate column.
    corpus_s2["noun"] = corpus_s2["text"].apply(extract_nouns)
    print("Step 6 is finished")
else:
    # Load the corpora produced by an earlier preprocessing run.
    corpus_s1 = pd.read_csv("data/input/clean/corpus_s1.csv")
    corpus_s2 = pd.read_csv("data/input/clean/corpus_s2.csv")
    corpus_ = pd.read_csv("data/input/clean/corpus_.csv")


## Exploratory Data Analysis

In [None]:
# Articles per class, plus each class's share of the corpus in percent.
# Relies on value_counts().reset_index() yielding the columns "variable" and "count".
descriptive_summary = news_df["variable"].value_counts().reset_index()
descriptive_summary["distribution"] = (
    descriptive_summary["count"]
    .div(descriptive_summary["count"].sum())
    .mul(100)
    .round(2)
)

bar_chart(
    x = descriptive_summary["count"],
    y = descriptive_summary["variable"],
    title = "Anzahl der Nachrichtenartikel je Klasse",
)

In [None]:
# Same chart as above, but with the percentage share instead of absolute counts.
class_share = descriptive_summary["distribution"]
class_label = descriptive_summary["variable"]
bar_chart(x = class_share, y = class_label, title = "Proz. Verteilung der Nachrichtenartikel je Klasse")

In [None]:
# Whitespace-separated word count per article (non-string values are
# stringified first), stored as a new column and plotted as a histogram.
news_df["word_count"] = [len(str(article).split()) for article in news_df["text"]]
histogram(news_df, "word_count", 100, "Anzahl der Wörter in einem Artikel")

In [None]:
# Distinct class labels in order of first appearance; reused by later cells.
class_names = news_df["variable"].unique()

# One word-count histogram per news category.
for class_name in class_names:
    class_articles = news_df[news_df["variable"] == class_name]
    histogram(
        class_articles,
        "word_count",
        100,
        f"Anzahl der Wörter in einem Artikel der Kategorie {class_name}",
    )

In [None]:
def plot_top_ngram(df, class_name, n_min = 2, n_max = 3, top_n = 10):
    """Plot the most frequent n-grams of one news category as a bar chart.

    Parameters
    ----------
    df : DataFrame with a "variable" (class label) and a "text" column.
    class_name : label whose articles are analysed.
    n_min, n_max : forwarded to get_top_n_gram (presumably the n-gram length
        range - confirm in python/data_visualization).
    top_n : forwarded to get_top_n_gram as n_words and shown in the chart title.

    Returns
    -------
    None; the chart is produced by bar_chart.
    """
    class_texts = df.loc[df["variable"] == class_name, "text"]
    ngram_counts = get_top_n_gram(class_texts, n_words = top_n, n_min = n_min, n_max = n_max)

    # The helper's result is tabulated as (n_gram, count) rows.
    ngram_table = pd.DataFrame(ngram_counts, columns = ["n_gram", "count"])

    bar_chart(ngram_table["count"], ngram_table["n_gram"], f"Top {top_n} Wörter der Kategorie {class_name}")


In [None]:
# Top-10 n-gram chart for every class, based on the pre-processed corpus_s1.
for class_name in class_names:
    plot_top_ngram(corpus_s1, class_name, top_n = 10)

In [None]:
# Summary statistics of the article length (in words) over the whole corpus.
news_df["word_count"].describe()

In [None]:
# Summary statistics of the article length, printed separately for each class.
for class_name in class_names:
    class_word_counts = news_df.loc[news_df["variable"] == class_name, "word_count"]
    print(class_name)
    print(class_word_counts.describe())
    print(" ")

In [None]:
# Top 15 n-grams (n_min = 2, n_max = 3) over the raw text of all articles.
n_grams = get_top_n_gram(
    news_df["text"],
    n_words = 15,
    n_min = 2,
    n_max = 3,
)
top_n_grams = pd.DataFrame(n_grams, columns = ["n_gram", "count"])

bar_chart(
    top_n_grams["count"],
    top_n_grams["n_gram"],
    f"Top 15 Wörter im Korpus",
)

In [None]:
def plot_top_words(df, class_name, top = 10):
    """Plot and return the most frequent words of one news category.

    Parameters
    ----------
    df : DataFrame with a "variable" (class label) and a "text" column whose
        entries are whitespace-separated strings.
    class_name : label whose articles are analysed.
    top : number of most frequent words to keep.

    Returns
    -------
    DataFrame with the columns "word" and "count", most frequent word first.
    """
    class_texts = df.loc[df["variable"] == class_name, "text"]

    # Skip missing texts: an article that ended up empty is read back from CSV
    # as NaN, and ' '.join() raises a TypeError on non-string items.
    words = " ".join(class_texts.dropna().astype(str)).split()

    # Name the columns explicitly rather than relying on how the installed
    # pandas version names the value_counts() result.
    top_words = (
        pd.Series(words)
        .value_counts()[:top]
        .rename_axis("word")
        .reset_index(name = "count")
    )

    bar_chart(x = top_words["count"], y = top_words["word"], title = f"Top {top} Wörter der Kategorie {class_name}")

    return top_words


In [None]:
# Top-10 word chart for every class, based on corpus_.
for class_name in class_names:
    plot_top_words(corpus_, class_name, top = 10)

In [None]:
# Cosine similarity matrix: mean pairwise cosine similarity between the TF-IDF
# vectors of the articles of every pair of classes.

#filtered_corpus = corpus_s1[corpus_s1["text"].apply(lambda x: len(x.split()) >= 200 and len(x.split()) <= 399)]

articles = corpus_s1["text"]

# Dedicated names for the per-article labels and the distinct labels. The
# earlier version overwrote class_names / class_name with different kinds of
# objects, which broke the per-class plotting cells when they were re-run.
article_labels = corpus_s1["variable"].to_numpy()
unique_labels = corpus_s1["variable"].unique()

tfidf_vectorizer = TfidfVectorizer(lowercase=False)
tfidf_matrix = tfidf_vectorizer.fit_transform(articles)

# Slice the TF-IDF rows of each class once instead of inside the double loop.
class_vectors = {label: tfidf_matrix[article_labels == label] for label in unique_labels}

# Float dtype so the frame is numeric for the heatmap (an empty frame filled
# via .loc would otherwise have object dtype).
mean_cosine_similarity = pd.DataFrame(index=unique_labels, columns=unique_labels, dtype=float)

# The result is deterministic, so one pass is enough (the previous 100
# repetitions recomputed identical values).
for class1 in tqdm(unique_labels):
    for class2 in unique_labels:

        similarity_matrix = cosine_similarity(class_vectors[class1], class_vectors[class2])

        # Average over ALL article pairs. Indexing similarity_matrix[i, j] with
        # the class positions picked out a single pair of articles instead.
        # For class1 == class2 this includes each article's similarity with itself.
        mean_cosine_similarity.loc[class1, class2] = similarity_matrix.mean()
        

In [None]:
# Display the class-by-class similarity table.
mean_cosine_similarity

In [None]:
# Heatmap of the class-by-class similarity table on a fixed [-1, 1] colour range.
px.imshow(
    mean_cosine_similarity,
    range_color = [-1, 1],
    width = 1000,
    height = 1000,
    color_continuous_scale = ["red", "grey", "blue"],
)

## Baselines

In [None]:
# Baseline 1: Gaussian Naive Bayes on corpus_s1 text, split_data(emb = True, imbalanced = False).
# NOTE(review): the "bow" naming suggests emb = True selects bag-of-words features - confirm in split_data.
(
    X_bow_s1, y_bow_s1,
    X_train_bow_s1, X_val_bow_s1, X_test_bow_s1,
    y_train_bow_s1, y_val_bow_s1, y_test_bow_s1,
) = split_data(corpus_s1["text"], corpus_s1["variable"], emb = True, imbalanced = False)

baseline_v1 = GaussianNB().fit(X_train_bow_s1, y_train_bow_s1)

# Weighted F1 on the held-out test split, rounded to three decimals.
y_pred = baseline_v1.predict(X_test_bow_s1)
f1 = np.round(
    f1_score(y_test_bow_s1, y_pred, average = "weighted"),
    3,
)

print(f"Baseline model with f1 score of {f1}.")

In [None]:
# Baseline 2: Gaussian Naive Bayes on corpus_s1 text, split_data(emb = False, imbalanced = False).
# NOTE(review): the "tfidf" naming suggests emb = False selects TF-IDF features - confirm in split_data.
(
    X_tfidfv1_s1, y_tfidfv1_s1,
    X_train_tfidfv1_s1, X_val_tfidfv1_s1, X_test_tfidfv1_s1,
    y_train_tfidfv1_s1, y_val_tfidfv1_s1, y_test_tfidfv1_s1,
) = split_data(corpus_s1["text"], corpus_s1["variable"], emb = False, imbalanced = False)

baseline_v2 = GaussianNB().fit(X_train_tfidfv1_s1, y_train_tfidfv1_s1)

# Weighted F1 on the held-out test split, rounded to three decimals.
y_pred = baseline_v2.predict(X_test_tfidfv1_s1)
f1 = np.round(
    f1_score(y_test_tfidfv1_s1, y_pred, average = "weighted"),
    3,
)

print(f"Baseline model with f1 score of {f1}.")

In [None]:
# Baseline 3: Gaussian Naive Bayes on the extracted nouns of corpus_s2,
# split_data(emb = True, imbalanced = False).
(
    X_bow_s2, y_bow_s2,
    X_train_bow_s2, X_val_bow_s2, X_test_bow_s2,
    y_train_bow_s2, y_val_bow_s2, y_test_bow_s2,
) = split_data(corpus_s2["noun"], corpus_s2["variable"], emb = True, imbalanced = False)

baseline_v3 = GaussianNB().fit(X_train_bow_s2, y_train_bow_s2)

# Weighted F1 on the held-out test split, rounded to three decimals.
y_pred = baseline_v3.predict(X_test_bow_s2)
f1 = np.round(
    f1_score(y_test_bow_s2, y_pred, average = "weighted"),
    3,
)

print(f"Baseline model with f1 score of {f1}.")

In [None]:
# Baseline 4: Gaussian Naive Bayes on the extracted nouns of corpus_s2,
# split_data(emb = False, imbalanced = False).
X_tfidfv1_s2, y_tfidfv1_s2, X_train_tfidfv1_s2, X_val_tfidfv1_s2, X_test_tfidfv1_s2, y_train_tfidfv1_s2, y_val_tfidfv1_s2, y_test_tfidfv1_s2 = split_data(corpus_s2["noun"], corpus_s2["variable"], emb = False, imbalanced = False)

baseline_v4 = GaussianNB()
baseline_v4.fit(X_train_tfidfv1_s2, y_train_tfidfv1_s2)

# Weighted F1 on the held-out test split, rounded to three decimals.
y_pred = baseline_v4.predict(X_test_tfidfv1_s2)
f1 = np.round(f1_score(y_test_tfidfv1_s2, y_pred, average="weighted"), 3)

# Fixed the missing space before the score ("of{f1}") so the message matches
# the other baseline cells.
print(f"Baseline model with f1 score of {f1}.")

In [None]:
# Baseline 5: Gaussian Naive Bayes on corpus_s1 text, split_data(emb = False, imbalanced = True).
# NOTE(review): what imbalanced = True changes is defined in split_data - confirm there.
(
    X_tfidfv2_s1, y_tfidfv2_s1,
    X_train_tfidfv2_s1, X_val_tfidfv2_s1, X_test_tfidfv2_s1,
    y_train_tfidfv2_s1, y_val_tfidfv2_s1, y_test_tfidfv2_s1,
) = split_data(corpus_s1["text"], corpus_s1["variable"], emb = False, imbalanced = True)

baseline_v5 = GaussianNB().fit(X_train_tfidfv2_s1, y_train_tfidfv2_s1)

# Weighted F1 on the held-out test split, rounded to three decimals.
y_pred = baseline_v5.predict(X_test_tfidfv2_s1)
f1 = np.round(
    f1_score(y_test_tfidfv2_s1, y_pred, average = "weighted"),
    3,
)

print(f"Baseline model with f1 score of {f1}.")

### Model Evaluation

In [None]:
# Gaussian Naive Bayes: tune var_smoothing on the validation split.

# NOTE(review): 1e-1 and .1 are the same value, so that setting is evaluated
# twice - confirm whether a different value was intended.
gnb_parameter = {"var_smoothing": [1e-10, 1e-06, 1e-1, .1, .33, .66, 1]}

# One row per candidate: [model index, var_smoothing, validation F1].
gnb_results = np.zeros((len(gnb_parameter["var_smoothing"]), 3))

for n_iter, var_smoothing in enumerate(gnb_parameter["var_smoothing"]):

    # Fit on the training split only, like the baselines do. The model was
    # previously fitted on X_tfidfv2_s1 / y_tfidfv2_s1 (the first two values
    # returned by split_data, apparently the complete data set), so the
    # validation articles had been seen during training and the score was
    # optimistically biased.
    gnb = GaussianNB(var_smoothing = var_smoothing)
    gnb.fit(X_train_tfidfv2_s1, y_train_tfidfv2_s1)

    y_val_pred = gnb.predict(X_val_tfidfv2_s1)
    gnb_f1 = np.round(f1_score(y_val_tfidfv2_s1, y_val_pred, average="weighted"), 3)

    gnb_results[n_iter] = [n_iter, var_smoothing, gnb_f1]

    # Models are reported 1-based; typo "complemted" corrected.
    print(f"Model {n_iter + 1}: with f1-score of {gnb_f1} is completed.")


In [None]:
# Pick the var_smoothing value with the highest validation F1
# (column 2 = score, column 1 = parameter value).
best_parameter_idx = np.argmax(gnb_results[:,2])
best_parameter_value = gnb_results[best_parameter_idx,1]

# Refit on the training split only. Fitting on X_tfidfv2_s1 / y_tfidfv2_s1
# (apparently the complete data set returned by split_data) would let the
# model see the test articles and inflate the reported test score.
gnb = GaussianNB(var_smoothing = best_parameter_value)
gnb.fit(X_train_tfidfv2_s1, y_train_tfidfv2_s1)

# Weighted F1 on the held-out test split.
y_pred = gnb.predict(X_test_tfidfv2_s1)
print(np.round(f1_score(y_test_tfidfv2_s1, y_pred, average="weighted"), 3))


In [None]:
# Per-class precision / recall / F1 for the y_pred produced by the previous cell.
print(classification_report(y_test_tfidfv2_s1, y_pred))

In [None]:
# Confusion matrix of the same test predictions (project plotting helper).
confusion_matrix_plot(y_test_tfidfv2_s1, y_pred)

In [None]:
# Multinomial Naive Bayes: tune the smoothing parameter alpha on the validation split.

# NOTE(review): 1e-1 and .1 are the same value, so that setting is evaluated
# twice - confirm whether a different value was intended.
mnb_parameter = {"alpha": [1e-10, 1e-06, 1e-1, .1, .33, .66, 1]}

# One row per candidate: [model index, alpha, validation F1].
mnb_results = np.zeros((len(mnb_parameter["alpha"]), 3))

for n_iter, alpha in enumerate(mnb_parameter["alpha"]):

    # Fit on the training split only, like the baselines do. The model was
    # previously fitted on X_tfidfv2_s1 / y_tfidfv2_s1 (the first two values
    # returned by split_data, apparently the complete data set), so the
    # validation articles had been seen during training and the score was
    # optimistically biased.
    mnb = MultinomialNB(alpha = alpha, fit_prior = True)
    mnb.fit(X_train_tfidfv2_s1, y_train_tfidfv2_s1)

    y_val_pred = mnb.predict(X_val_tfidfv2_s1)
    mnb_f1 = np.round(f1_score(y_val_tfidfv2_s1, y_val_pred, average = "weighted"), 3)

    mnb_results[n_iter] = [n_iter, alpha, mnb_f1]

    # Models are reported 1-based.
    print(f"Model {n_iter + 1} with F1-score of {mnb_f1} is completed.")



In [None]:
# Pick the alpha with the highest validation F1
# (column 2 = score, column 1 = parameter value).
best_parameter_idx = np.argmax(mnb_results[:,2])
best_parameter_value = mnb_results[best_parameter_idx,1]

# Refit on the training split only. Fitting on X_tfidfv2_s1 / y_tfidfv2_s1
# (apparently the complete data set returned by split_data) would let the
# model see the test articles and inflate the reported test score.
mnb = MultinomialNB(alpha = best_parameter_value, fit_prior = True)
mnb.fit(X_train_tfidfv2_s1, y_train_tfidfv2_s1)

# Weighted F1 on the held-out test split.
y_pred = mnb.predict(X_test_tfidfv2_s1)
print(np.round(f1_score(y_test_tfidfv2_s1, y_pred, average="weighted"), 3))


In [None]:
# Per-class precision / recall / F1 for the y_pred produced by the previous cell.
print(classification_report(y_test_tfidfv2_s1, y_pred))

In [None]:
# Confusion matrix of the same test predictions (project plotting helper).
confusion_matrix_plot(y_test_tfidfv2_s1, y_pred)

In [None]:
# Complement Naive Bayes: tune the smoothing parameter alpha on the validation split.

# NOTE(review): 1e-1 and .1 are the same value, so that setting is evaluated
# twice - confirm whether a different value was intended.
cnb_parameter = {"alpha": [1e-10, 1e-06, 1e-1, .1, .33, .66, 1]}

# One row per candidate: [model index, alpha, validation F1].
cnb_results = np.zeros((len(cnb_parameter["alpha"]), 3))

for n_iter, alpha in enumerate(cnb_parameter["alpha"]):

    # Fit on the training split only, like the baselines do. The model was
    # previously fitted on X_tfidfv1_s1 / y_tfidfv1_s1 (the first two values
    # returned by split_data, apparently the complete data set), so the
    # validation articles had been seen during training and the score was
    # optimistically biased.
    cnb = ComplementNB(alpha = alpha, norm = False)
    cnb.fit(X_train_tfidfv1_s1, y_train_tfidfv1_s1)

    y_val_pred = cnb.predict(X_val_tfidfv1_s1)
    cnb_f1 = np.round(f1_score(y_val_tfidfv1_s1, y_val_pred, average="weighted"), 3)

    cnb_results[n_iter] = [n_iter, alpha, cnb_f1]

    # Models are reported 1-based.
    print(f"Model {n_iter + 1} with f1 score of {cnb_f1} is completed.")

In [None]:
# Pick the alpha with the highest validation F1
# (column 2 = score, column 1 = parameter value).
best_parameter_idx = np.argmax(cnb_results[:,2])
best_parameter_value = cnb_results[best_parameter_idx,1]

# Refit on the training split only. Fitting on X_tfidfv1_s1 / y_tfidfv1_s1
# (apparently the complete data set returned by split_data) would let the
# model see the test articles and inflate the reported test score.
cnb = ComplementNB(alpha = best_parameter_value, norm = False)
cnb.fit(X_train_tfidfv1_s1, y_train_tfidfv1_s1)

# Weighted F1 on the held-out test split.
y_pred = cnb.predict(X_test_tfidfv1_s1)
print(np.round(f1_score(y_test_tfidfv1_s1, y_pred, average="weighted"), 3))


In [None]:
# Per-class precision / recall / F1 for the y_pred produced by the previous cell.
print(classification_report(y_test_tfidfv1_s1, y_pred))

In [None]:
# Confusion matrix of the same test predictions (project plotting helper).
confusion_matrix_plot(y_test_tfidfv1_s1, y_pred)
