In [32]:
import json
import os

data = {}
for name in ["train", "dev", "test"]:
    file_path = os.path.join(os.pardir, "Data", "3class", f"{name}.json")
    with open(file_path) as filedata:
        data[name] = json.load(filedata)

In [33]:
data["train"][:3]

[{'sent_id': '201911-01-01', 'text': 'Philips 190G6', 'label': 'Neutral'},
 {'sent_id': '201911-02-01',
  'text': 'Med integrerte høyttalere som på ingen måte er diskret plassert , og med en stor subwoofer inkludert , da snakker vi om en gutteskjerm .',
  'label': 'Neutral'},
 {'sent_id': '201911-02-02',
  'text': 'Eller bedrar skinnet ?',
  'label': 'Negative'}]

In [34]:
# this cell contains 2 functions, one that tokenizes the words for each sentence and one that lemmatizes the words

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

punctuation = '''´«»!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'''

stop_words = set(stopwords.words('norwegian'))

# function that 'cleans' the text from the sets, tokenizes and removes stop words and punctuations
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    text = ' '.join(tokens)
    return text


nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    text = clean_text(text)
    return lemmatizer.lemmatize(text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\colin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
# this cell contains 2 functions, one that checks language and one that translates if English
from langdetect import detect
from googletrans import Translator

translator = Translator()

# recognizes the language of the text and translates if English
def recognizeLanguage(text):
    try:
        if detect(text) == 'en':
            return translateText(text)
        else:
            return text
    except:
        return text

# translates the text given
def translateText(text):
    translated = translator.translate(text, src='en', dest='no')
    return translated.text


In [36]:
text_train = []
for row in data["train"]:
    text = lemmatize(row['text'])
    text = recognizeLanguage(text)
    text_train.append(text)

text_test = []
for row in data["test"]:
    text = lemmatize(row['text'])
    text = recognizeLanguage(text)
    text_test.append(text)

In [37]:
import numpy as np

train_label_list = []
for row in data["train"]:
    train_label_list.append(row["label"])
    
test_label_list = []
for row in data["test"]:
    test_label_list.append(row["label"])
    
y_train = np.array(train_label_list)
y_test = np.array(test_label_list)

print(f"Training label shape: {y_train.shape}")
print(f"Testing label shape: {y_test.shape}")

Training label shape: (7973,)
Testing label shape: (1181,)


In [38]:
from nltk.corpus import stopwords

s_words = stopwords.words("norwegian")
print("Number of Norwegian stopwords: " + str(len(s_words)))
print(f"LIST OF STOPWORDS: {s_words}")

Number of Norwegian stopwords: 176
LIST OF STOPWORDS: ['og', 'i', 'jeg', 'det', 'at', 'en', 'et', 'den', 'til', 'er', 'som', 'på', 'de', 'med', 'han', 'av', 'ikke', 'ikkje', 'der', 'så', 'var', 'meg', 'seg', 'men', 'ett', 'har', 'om', 'vi', 'min', 'mitt', 'ha', 'hadde', 'hun', 'nå', 'over', 'da', 'ved', 'fra', 'du', 'ut', 'sin', 'dem', 'oss', 'opp', 'man', 'kan', 'hans', 'hvor', 'eller', 'hva', 'skal', 'selv', 'sjøl', 'her', 'alle', 'vil', 'bli', 'ble', 'blei', 'blitt', 'kunne', 'inn', 'når', 'være', 'kom', 'noen', 'noe', 'ville', 'dere', 'som', 'deres', 'kun', 'ja', 'etter', 'ned', 'skulle', 'denne', 'for', 'deg', 'si', 'sine', 'sitt', 'mot', 'å', 'meget', 'hvorfor', 'dette', 'disse', 'uten', 'hvordan', 'ingen', 'din', 'ditt', 'blir', 'samme', 'hvilken', 'hvilke', 'sånn', 'inni', 'mellom', 'vår', 'hver', 'hvem', 'vors', 'hvis', 'både', 'bare', 'enn', 'fordi', 'før', 'mange', 'også', 'slik', 'vært', 'være', 'båe', 'begge', 'siden', 'dykk', 'dykkar', 'dei', 'deira', 'deires', 'deim', 'd

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(stop_words=s_words, min_df=2)
vec.fit(text_train)
X_train = vec.transform(text_train).toarray()
X_test = vec.transform(text_test).toarray()

print(f"Vocabulary size: {len(vec.vocabulary_)}")
print(f"Every 100th word in vocabulary: {vec.get_feature_names_out()[::100]}")

Vocabulary size: 7801
Every 100th word in vocabulary: ['000' '33' 'aktivt' 'angår' 'aserbajdsjan' 'bandets' 'bekymringsfri'
 'bieber' 'bob' 'brukt' 'cat' 'dama' 'derimot' 'dramatikk' 'dødsfallet'
 'elektrisk' 'erfaringen' 'faren' 'filmenes' 'fokuseringen' 'formen'
 'framfor' 'fylle' 'generelt' 'gleda' 'gudfaren' 'havnet' 'highway'
 'hovedpersonene' 'høyder' 'innholdsrik' 'is' 'jos' 'kategorier'
 'klassekampen' 'kommunisere' 'kraftigere' 'kvinners' 'leken' 'linjer'
 'lyset' 'manusforfatterne' 'mena' 'miljøet' 'mottar' 'nakenhet' 'noise'
 'offisielle' 'oppleves' 'overhodet' 'personen' 'pornographers'
 'prosessoren' 'real' 'reynolds' 'rumleskaft' 'sammenliknet' 'selges'
 'sikte' 'skimter' 'skyggeforfatteren' 'snarere' 'spillbarheten'
 'statsministeren' 'strait' 'supervention' 'sødahl' 'teknologien'
 'tildelt' 'tony' 'troverdig' 'tømme' 'univers' 'utseende' 'varm'
 'veteranene' 'von' 'ypperlige' 'øyvind']


In [40]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=75, n_jobs=-1)
logreg.fit(X_train, y_train)

In [41]:
from sklearn.metrics import accuracy_score

train_pred_1 = logreg.predict(X_train)
test_pred_1 = logreg.predict(X_test)

print("Logistic Regression first try scores")
print("Training score: {:.3f}".format(accuracy_score(y_train, train_pred_1)))
print("Test score: {:.3f}".format(accuracy_score(y_test, test_pred_1)))

Logistic Regression first try scores
Training score: 0.929
Test score: 0.616


In [48]:

count_dict = {"Neutral": 0, "Positive": 0, "Negative": 0}
for label in train_pred_1:
    count_dict[label] += 1

display(count_dict)


{'Neutral': 4454, 'Positive': 2485, 'Negative': 1034}

In [43]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=400, n_jobs=-1)
rfc.fit(X_train, y_train)

In [44]:
train_pred_2 = rfc.predict(X_train)
test_pred_2 = rfc.predict(X_test)

print("Random Forest scores")
print("Training score: {:.3f}".format(accuracy_score(y_train, train_pred_2)))
print("Test score: {:.3f}".format(accuracy_score(y_test, test_pred_2)))

Random Forest scores
Training score: 0.901
Test score: 0.599


In [45]:
max_depth = 0
sum_depth = 0

for e in rfc.estimators_:
    sum_depth += e.tree_.max_depth
    if(e.tree_.max_depth > max_depth):
        max_depth = e.tree_.max_depth

print(f"Maximum depth is: {max_depth}")
print(f"Average depth is: {(sum_depth/len(rfc.estimators_))}")

Maximum depth is: 400
Average depth is: 400.0


In [46]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha=1)
mnb.fit(X_train, y_train)

In [47]:
train_pred_3 = mnb.predict(X_train)
test_pred_3 = mnb.predict(X_test)

print("Multinomial Naive Bayes scores")
print("Training score: {:.3f}".format(accuracy_score(y_train, train_pred_3)))
print("Test score: {:.3f}".format(accuracy_score(y_test, test_pred_3)))

Multinomial Naive Bayes scores
Training score: 0.861
Test score: 0.600
