# This notebook shows the creation of simple classifiers using the TF-IDF measure

# Library import

In [1]:
from sklearn import model_selection, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import ensemble

import xgboost, string, re
import pandas as pd
import nltk
from nltk.corpus import stopwords
import simplemma

import warnings
warnings.filterwarnings('ignore')
import pickle

# Read data, quick analysis

In [2]:
df=pd.read_csv('data.csv',usecols=['text', 'label'], encoding='utf-8')
df.head()

Unnamed: 0,text,label
0,"Company name, Murr, (address). Bestellt als Vo...",0
1,"Register number: Company name, address. Gesell...",0
2,"Company name, Hamburg, address. GeschĂ¤ftsansc...",0
3,"Register number: Company name, Bochum, address...",1
4,"Company name, address. GeschĂ¤ftsanschrift: ad...",0


In [3]:
df.shape

(11539, 2)

In [4]:
len(df.loc[df.label==1])

4952

In [5]:
df.loc[df.label==0].text[0]

'Company name, Murr, (address). Bestellt als Vorstand: Firstname, Surname, city, year, einzelvertretungsberechtigt mit der Befugnis, im Namen der Gesellschaft mit sich als Vertreter eines Dritten RechtsgeschĂ¤fte abzuschlieĂźen. Aufgrund Umschreibungsfehlers Vertretungsbefugnis bei der Befreiung von Â§ 181 BGB von Amts wegen berichtigt bei Vorstand:Firstname, Surname, city, year, einzelvertretungsberechtigt mit der Befugnis, im Namen der Gesellschaft mit sich als Vertreter eines Dritten RechtsgeschĂ¤fte abzuschlieĂźen. InlĂ¤ndische GeschĂ¤ftsanschrift: address'

In [6]:
df.loc[df.label==1].text[11534]

'Register number: Company name,, Sitz vormals: address. Der Sitz der Gesellschaft ist nach Kiel verlegt (Amtsgericht Kiel, HRB 20813 KI). Neue Geschäftsanschrift: new address.'

In [7]:
df=df.convert_dtypes()
df.dtypes

text     string
label     Int64
dtype: object

# Data normalization

In [8]:
def preprocessing(data, column_name:str, stopwords_language:str):
    """
    This function preprocess insert german text data, doing:
    * drop stopwords form german
    * create column with zip and cities
    * create column with date of birth
    * drop punctation
    * lematization of words in text
    :param data dataset with text in german:
    :return cleared data:
    """
    #rid off punctation
    data['cleared text'] = data[column_name].str.replace('[^\w\s]', '', regex = True)
    
    #lematization
    langdata = simplemma.load_data('de')
    data['cleared text'] = data['cleared text'].apply(lambda x: " ".join([simplemma.lemmatize(word, langdata) for word in x.split()]))
    
    # rid off stopwords
    nltk.download('stopwords')
    stop = stopwords.words(stopwords_language)
    data['cleared text'] = data['cleared text'].apply(lambda x: " ".join(x.lower() for x in x.split() if x not in stop))
    
    return data

In [9]:
df=preprocessing(df, 'text', 'german')
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rdjf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label,cleared text
0,"Company name, Murr, (address). Bestellt als Vo...",0,company name murren address bestellt vorstand ...
1,"Register number: Company name, address. Gesell...",0,register number company name address gesellsch...
2,"Company name, Hamburg, address. GeschĂ¤ftsansc...",0,company name hamburg address geschăftsanschrif...
3,"Register number: Company name, Bochum, address...",1,register number company name bochum address ge...
4,"Company name, address. GeschĂ¤ftsanschrift: ad...",0,company name address geschăftsanschrift addres...


In [10]:
df.loc[df.label==0]['cleared text'][0]

'company name murren address bestellt vorstand firstname surname city year einzelvertretungsberechtigt befugnis name gesellschaft er|es|sie vertreter dritte rechtsgeschăfte abzuschlieăźen aufgrund umschreibungsfehlers vertretungsbefugnis befreiung â 181 bgb amt wegen berichtigt vorstandfirstname surname city year einzelvertretungsberechtigt befugnis name gesellschaft er|es|sie vertreter dritte rechtsgeschăfte abzuschlieăźen inlăndische geschăftsanschrift address'

In [11]:
df.loc[df.label==1]['cleared text'][11534]

'register number company name sitz vormals address sitz gesellschaft kiel vorverlegen amtsgericht kiel hrb 20813 ki neu geschäftsanschrift new address'

# Create TF_IDF

In [12]:
# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['text'], df['label'],random_state=123)

In [13]:
#change type of data, idk why classifer read it like str
train_y=train_y.astype(int)
test_y=test_y.astype(int)

In [14]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

In [15]:
pickle.dump(tfidf_vect, open(f'TFIDF.pkl', 'wb'))

# Function to train calssificators

In [16]:
def train_model(classifier, feature_vector_train, label, feature_vector_vtest, test_y,is_neural_net=False,save=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # Save the trained model as a pickle string.
    if save:
        pickle.dump(classifier, open(f'{str(classifier).split("(")[0]}.pkl', 'wb'))
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_vtest)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.accuracy_score(predictions, test_y)

# Naive Bayes

In [17]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf, test_y, save = True)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram, test_y)
print ("NB, N-Gram Vectors: ", accuracy)

NB, WordLevel TF-IDF:  0.9102253032928943
NB, N-Gram Vectors:  0.9206239168110919


# Linear Regression

In [18]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(random_state=123), xtrain_tfidf, train_y, xtest_tfidf, test_y, save = True)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(random_state=123), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram,test_y)
print ("LR, N-Gram Vectors: ", accuracy)

LR, WordLevel TF-IDF:  0.9487001733102253
LR, N-Gram Vectors:  0.9608318890814558


# SVM

In [19]:
# SVM on Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(random_state=123), xtrain_tfidf, train_y, xtest_tfidf, test_y, save = True)
print ("SVM, WordLevel TF-IDF:", accuracy)

# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(random_state=123), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram, test_y)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, WordLevel TF-IDF: 0.963258232235702
SVM, N-Gram Vectors:  0.9688041594454073


# Random Forest 
Random Forest has the possibility to show which words the model pays special attention to when classifying

In [20]:
def train_model_RF(classifier, feature_vector_train, label, feature_vector_vtest, test_y, is_neural_net = False, save = False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # Save the trained model as a pickle string.
    if save:
        pickle.dump(classifier, open(f'{str(classifier).split("(")[0]}.pkl', 'wb'))
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_vtest)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    importances = classifier.feature_importances_
    return metrics.accuracy_score(predictions, test_y),importances

In [21]:
# RF on Word Level TF IDF Vectors #wizualziacja drzewa
accuracy,importances= train_model_RF(ensemble.RandomForestClassifier(n_estimators=50,random_state=123), xtrain_tfidf, train_y, xtest_tfidf, test_y, save = True)
print ("RF, WordLevel TF-IDF: ", accuracy)
# RF on Ngram Level TF IDF Vectors
accuracy,im2 = train_model_RF(ensemble.RandomForestClassifier(n_estimators=50,random_state=123), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram, test_y)
print ("RF, N-Gram Vectors: ", accuracy)

RF, WordLevel TF-IDF:  0.9809358752166378
RF, N-Gram Vectors:  0.9660311958405546


In [23]:
# TF-IFD model importance of words
df_importance=pd.DataFrame({'Word_tfidf':tfidf_vect.get_feature_names(), 'importance':importances})
df_importance.sort_values(by = ['importance'], ascending = False)[:5] 

Unnamed: 0,Word_tfidf,importance
2314,geschäftsanschrift,0.16623
4974,änderung,0.026122
1695,die,0.023965
3461,nach,0.02294
4545,verlegt,0.017289


In [24]:
# TF-IFD n-gram model importance of words
df_importance=pd.DataFrame({'Word_ngram':tfidf_vect_ngram.get_feature_names(), 'importance':im2})
df_importance.sort_values(by = ['importance'],ascending = False)[:5]

Unnamed: 0,Word_ngram,importance
4918,zur geschäftsanschrift,0.024732
1732,der sitz,0.018045
2838,geänderte geschäftsanschrift,0.016139
3646,nach änderung,0.014738
1669,der geschäftsanschrift,0.014082


# XGBoost

In [25]:
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xtest_tfidf.tocsc(), test_y, save = True)
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Ngram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(), train_y, xtest_tfidf_ngram.tocsc(), test_y)
print ("Xgb, N-Gram Vectors: ", accuracy)

Xgb, WordLevel TF-IDF:  0.9889081455805893
Xgb, N-Gram Vectors:  0.9746967071057192


# Check pickle
Below shows how to save and load ML models in pickle format

In [None]:
# example
import pickle
pickle.dump(classifier, open(f'name_of_file.pkl', 'wb'))
classifier=pickle.load(open('name_of_file.pkl', 'rb'))

In [26]:
# Checking whether the model has been saved correctly
pickled_model = pickle.load(open('XGBClassifier.pkl', 'rb'))
predictions=pickled_model.predict(xtest_tfidf)
metrics.accuracy_score(predictions, test_y)

0.9889081455805893

# Check publication as a string

In [27]:
def preprocessing_text(text, stopwords_language):
    #rid off punctation
    text= re.sub(r'[^\w\s]', "", text)
    lematization = ''
    
    #lematization
    langdata = simplemma.load_data('de')
    for word in text.split():
        lematization += ' '+simplemma.lemmatize(word, langdata)
        
    # rid off stopwords
    nltk.download('stopwords')
    stop = stopwords.words(stopwords_language)
    text_cleared = ''
    for word in lematization.split():
        if word not in stop:
            text_cleared += " " + word.lower()   
    return text_cleared

In [28]:
publication_change = 'Register number, company name, address. Firma geändert, nun: Neue Firma: new company name. Sitz verlegt, nun: Neuer Sitz: Witten. Geändert, nun: Geschäftsanschrift: new address.'
publication_no_change = 'Register number, company name, address. Rechtsverhaeltnis: Die Gesellschaft ist auf Grund des Verschmelzungsvertrages vom 02.07.2015 und der Zustimmungsbeschlüsse vom 20.07.2015 durch Übertragung ihres Vermögens als Ganzes unter Auflösung ohne Abwicklung auf die company name verschmolzen. Die Verschmelzung ist mit der gleichzeitig erfolgten Eintragung in das Register des Sitzes des übernehmenden Rechtsträgers wirksam geworden. Die Firma ist erloschen. Als nicht eingetragen wird veröffentlicht: Den Gläubigern der an der Verschmelzung beteiligten Rechtsträger ist, wenn sie binnen sechs Monaten nach dem Tag, an dem die Eintragung der Verschmelzung in das Register des Sitzes desjenigen Rechtsträgers, dessen Gläubiger sie sind, als bekannt gemacht gilt, ihren Anspruch nach Grund und Höhe schriftlich anmelden, Sicherheit zu leisten, soweit sie nicht Befriedigung verlangen können. Dieses Recht steht den Gläubigern jedoch nur zu, wenn sie glaubhaft machen, dass durch die Verschmelzung die Erfüllung ihrer Forderung gefährdet wird.'


In [30]:
preprocessing_text(publication_change, 'german')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rdjf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


' register number company name address firma geändert neu firma new company name sitz vorverlegen neu sitz witten geändert geschäftsanschrift new address'

In [31]:
prediction=pickled_model.predict(tfidf_vect.transform([preprocessing_text(publication_change, 'german')]))
prediction[0]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rdjf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1

In [32]:
prediction=pickled_model.predict(tfidf_vect.transform([preprocessing_text(publication_no_change, 'german')]))
prediction[0]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rdjf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0