<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h1 style='margin:10px 5px'> 
Master Thesis Yannik Haller - Bigram and Trigram Models
</h1>
</div>

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
1. Load required packages and the data
</h2>
</div>

In [1]:
# Import required baseline packages
import re
import os
import glob
import time
import sys
import pandas as pd
import numpy as np
from pprint import pprint

# Change pandas' setting to print out long strings
pd.options.display.max_colwidth = 200

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy (for lemmatization)
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim (optional)
import logging
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)

  def _figure_formats_changed(self, name, old, new):


In [2]:
# Switch to the project's data folder so that the relative paths used further below resolve against it
working_dir = 'D:\\Dropbox\\MA_data'
os.chdir(working_dir)

In [3]:
# Define a function to read in the fully preprocessed data
def read_preprocessed(language, tokenize = True, data_dir = 'D:\\Dropbox\\MA_data'):
    """
    Load the fully preprocessed articles of one language.

    The data is read from "<data_dir>/Preprocessed/<language>_preprocessed.csv", whose first
    column is used as index (the article ids) and whose 'tx' column holds the article texts.

    Parameters
    ----------
    language : str
        Language code of the corpus to load; one of 'de', 'en', 'fr', 'it'.
    tokenize : bool, default True
        If True, every article is turned into a list of tokens via retokenize() (RAM-saving
        compared to storing tokenized data). If False, the entries of the 'tx' column are
        returned as they were read.
    data_dir : str, optional
        Folder that contains the "Preprocessed" directory. Defaults to the project's data
        folder. The file is addressed through its full path, so the function does not change
        the working directory of the process.

    Returns
    -------
    tx_pp : list
        One entry per article, in file order.
    idx : pandas.DataFrame
        Single column '<language>_idx' holding the article ids. Its default 0..n-1 row labels
        give the position of the respective article in tx_pp.

    Raises
    ------
    ValueError
        If language is not one of the allowed language codes.
    """
    # Raise an error if an inadmissible language is chosen
    allowed_languages = ['de', 'en', 'fr', 'it']
    if language not in allowed_languages:
        raise ValueError("Invalid language. Expected one of: %s" % allowed_languages)

    # Build the full path of the file to load (independent of the current working directory)
    filename = os.path.join(data_dir, "Preprocessed", language + "_preprocessed.csv")

    # Read in the dataframe containing the text data
    tx_pp = pd.read_csv(filename, index_col = 0, dtype = {'tx': object})

    # Keep the article ids in a dataframe; its row labels enumerate the articles' positions in the returned list
    idx = pd.DataFrame(tx_pp.index, columns = [language + '_idx'])

    # Reduce the dataframe to a list containing the text data
    tx_pp = tx_pp.tx.to_list()

    # Tokenize the data again if tokenize = True (RAM-saving)
    if tokenize:
        tx_pp = retokenize(tx_pp)

    # Return the preprocessed data
    return tx_pp, idx

# Define a function to retokenize the preprocessed text data (RAM-saving)
def retokenize(article_list):
    """
    Split every article of a list into its whitespace-separated tokens.

    The list is modified in place (each entry is replaced by its list of tokens), so that no
    second full-size list has to be held in memory; the same list object is returned.

    Parameters
    ----------
    article_list : list
        Articles as strings with tokens separated by whitespace. Missing entries (None or a
        float NaN, which is what pandas yields for an empty csv cell) are accepted as well.

    Returns
    -------
    list
        article_list itself, now holding one list of tokens per article. Missing or empty
        articles become an empty list.
    """
    for i, text in enumerate(article_list):
        # A missing text must not be stringified: str(NaN) would yield the bogus token 'nan'
        # (text != text is True only for NaN)
        if text is None or (isinstance(text, float) and text != text):
            article_list[i] = []
        else:
            article_list[i] = str(text).split()
    return article_list

In [4]:
# Read in the preprocessed 'de' articles (one list of tokens per article) and the dataframe holding their article ids
de_tx, de_idx = read_preprocessed('de')

# Take a look at the size (in bytes) of the list object holding the precleaned data
# NOTE: sys.getsizeof is shallow - it only accounts for the outer list itself (its array of references),
# not for the token lists and strings it refers to, so the actual memory footprint of the data is larger
sys.getsizeof(de_tx)

15474568

In [5]:
# Take a look at the first six tokens of the first preprocessed article
de_tx[0][:6]

['rückkehrer', 'stefan', 'meier', 'überragen', 'flames', 'herisau']

In [6]:
# Take a look at the last three rows of the dataframe containing the according index
# (row label = position of the article in de_tx, column de_idx = article id)
de_idx.tail(3)

Unnamed: 0,de_idx
1934310,2441180
1934311,2441181
1934312,2441182


In [7]:
# Look up where given articles sit in the preprocessed data, starting from their article ids
article_ids = [2441180, 2441181]
# Mark the rows of the index dataframe whose article id was requested; their row labels are the positions in de_tx
id_matches = de_idx['de_idx'].isin(article_ids)
location = de_idx.loc[id_matches].index.tolist()

# The preprocessed texts of these articles could then be accessed via: list(de_tx[i] for i in location)

# Show the positions found for the articles with the article ids in [2441180, 2441181]
location

[1934310, 1934311]

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
2. Build bigram model
</h2>
</div>

In [8]:
# Build bigrams while keeping track of the processing time
# NOTE: this cell overwrites de_tx in place, so it must be run exactly once per load of the data
# (running it again would build bigrams of the bigrams)
t = time.time()
# Go through all articles and replace each list of unigrams by the corresponding list of bigrams
for n, tokens in enumerate(de_tx):
    # Pair every token with its successor; articles with fewer than two tokens yield an empty list
    de_tx[n] = [left + "_" + right for left, right in zip(tokens, tokens[1:])]
print("Processing time to build bigrams :", str((time.time() - t)/60), "minutes")

Processing time to build bigrams : 5.357060194015503 minutes


In [9]:
# Take a look at the first six bigrams of the first bigrammed article
de_tx[0][:6]

['rückkehrer_stefan',
 'stefan_meier',
 'meier_überragen',
 'überragen_flames',
 'flames_herisau',
 'herisau_bangen']

In [10]:
## Save the phrased text data to a csv file
# Join the phrases of each article into one string in which they are separated by a blank (such that it's easy to read in later)
de_tx_out = [" ".join(article) for article in de_tx]

# Make sure the target folder exists: otherwise the export would fail only now, after the long-running phrase building
out_path = "Preprocessed/Phrased/de_phrased.csv"
os.makedirs(os.path.dirname(out_path), exist_ok = True)

# Create a correctly indexed dataframe (index = article id) containing the phrased data in the column 'tx' and export it as a csv file
pd.DataFrame(de_tx_out, index = de_idx.de_idx, columns = ['tx']).to_csv(out_path, index = True, encoding = 'utf-8-sig')