In [1]:
import json

# Load every split of the 3-class dataset into `data`, keyed by split name.
data = {}

for name in ["train", "dev", "test"]:
    # Decode explicitly as UTF-8: the texts contain Norwegian characters
    # (e.g. ø, å) and the platform default encoding is not UTF-8 everywhere,
    # which would otherwise mis-decode them or raise UnicodeDecodeError.
    with open(f"Data/3class/{name}.json", "r", encoding="utf-8") as filedata:
        data[name] = json.load(filedata)

In [2]:
# Peek at the first three training records to check their structure.
data["train"][:3]

[{'sent_id': '201911-01-01', 'text': 'Philips 190G6', 'label': 'Neutral'},
 {'sent_id': '201911-02-01',
  'text': 'Med integrerte høyttalere som på ingen måte er diskret plassert , og med en stor subwoofer inkludert , da snakker vi om en gutteskjerm .',
  'label': 'Neutral'},
 {'sent_id': '201911-02-02',
  'text': 'Eller bedrar skinnet ?',
  'label': 'Negative'}]

In [3]:
# Extract the raw sentence text of every record, preserving the order of
# each split so the texts stay aligned with their labels.
text_train = [record["text"] for record in data["train"]]
text_test = [record["text"] for record in data["test"]]

In [22]:
import numpy as np

# Collect the label of every record (same order as the records themselves)
# and expose them as NumPy arrays for the estimators.
train_label_list = [record["label"] for record in data["train"]]
test_label_list = [record["label"] for record in data["test"]]

y_train, y_test = np.array(train_label_list), np.array(test_label_list)

# Sanity check: one label per sentence in each split.
for split_name, labels in (("Training", y_train), ("Testing", y_test)):
    print(f"{split_name} label shape: {labels.shape}")

Training label shape: (7973,)
Testing label shape: (1181,)


In [62]:
import nltk
from nltk.corpus import stopwords

# Norwegian stop-word list shipped with NLTK.
# NOTE(review): stopwords.words() raises LookupError when the "stopwords"
# corpus has not been downloaded (nltk.download("stopwords")) - confirm the
# environment provides it.
s_words = stopwords.words("norwegian")
print(f"Number of Norwegian stopswords: {len(s_words)}")
print("LIST OF STOPWORDS: {}".format(s_words))

Number of Norwegian stopswords: 176
LIST OF STOPWORDS: ['og', 'i', 'jeg', 'det', 'at', 'en', 'et', 'den', 'til', 'er', 'som', 'på', 'de', 'med', 'han', 'av', 'ikke', 'ikkje', 'der', 'så', 'var', 'meg', 'seg', 'men', 'ett', 'har', 'om', 'vi', 'min', 'mitt', 'ha', 'hadde', 'hun', 'nå', 'over', 'da', 'ved', 'fra', 'du', 'ut', 'sin', 'dem', 'oss', 'opp', 'man', 'kan', 'hans', 'hvor', 'eller', 'hva', 'skal', 'selv', 'sjøl', 'her', 'alle', 'vil', 'bli', 'ble', 'blei', 'blitt', 'kunne', 'inn', 'når', 'være', 'kom', 'noen', 'noe', 'ville', 'dere', 'som', 'deres', 'kun', 'ja', 'etter', 'ned', 'skulle', 'denne', 'for', 'deg', 'si', 'sine', 'sitt', 'mot', 'å', 'meget', 'hvorfor', 'dette', 'disse', 'uten', 'hvordan', 'ingen', 'din', 'ditt', 'blir', 'samme', 'hvilken', 'hvilke', 'sånn', 'inni', 'mellom', 'vår', 'hver', 'hvem', 'vors', 'hvis', 'både', 'bare', 'enn', 'fordi', 'før', 'mange', 'også', 'slik', 'vært', 'være', 'båe', 'begge', 'siden', 'dykk', 'dykkar', 'dei', 'deira', 'deires', 'deim', '

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vocabulary is learned from the training texts
# only (stop words removed, terms must occur in at least 2 documents) and
# then applied unchanged to the test texts.
vec = CountVectorizer(min_df=2, stop_words=s_words)
X_train = vec.fit_transform(text_train).toarray()
X_test = vec.transform(text_test).toarray()
# NOTE(review): .toarray() materialises dense matrices; confirm that is
# needed, since it is memory-hungry for a vocabulary of this size.

vocabulary_size = len(vec.vocabulary_)
every_100th_word = vec.get_feature_names_out()[::100]
print(f"Vocabulary size: {vocabulary_size}")
print(f"Every 100th word in vocabulary: {every_100th_word}")

Vocabulary size: 7857
Every 100th word in vocabulary: ['000' '33' 'aktive' 'angry' 'artistnavnet' 'banal' 'beite' 'beyoncé'
 'blå' 'bruk' 'cameron' 'cvt' 'demonen' 'doser' 'dyrholm' 'ekstraomganger'
 'ensemble' 'fange' 'figurer' 'flotteste' 'forholdsvis' 'forventede'
 'fullstendig' 'gate' 'gjest' 'grunnkonseptet' 'hardere' 'herbert' 'hop'
 'håpløst' 'innblikk' 'introdusert' 'johannes' 'karakteristikk' 'kjører'
 'kombinasjon' 'koring' 'kvalifisert' 'ledetråder' 'liker' 'lyde' 'mangel'
 'medmennesker' 'middager' 'morrison' 'måtte' 'ni' 'nært' 'oppfatninger'
 'oslos' 'pcen' 'poenget' 'private' 'rakk' 'replikk' 'romanen' 'sagt'
 'science' 'sette' 'sjølv' 'skrike' 'slåss' 'soul' 'spøkelsesaktig'
 'stillingtagen' 'større' 'sylskarpt' 'tankevekkende' 'tettsittende'
 'ting' 'treffende' 'tv' 'ulykkelig' 'utfor' 'vakt' 'venner' 'vis' 'who'
 'års']


In [69]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression on the bag-of-words counts.
# NOTE(review): max_iter=75 is below scikit-learn's default of 100 - check
# whether fitting emits a ConvergenceWarning.
logreg = LogisticRegression(
    n_jobs=-1,
    max_iter=75,
)
logreg.fit(X=X_train, y=y_train)

In [70]:
from sklearn.metrics import accuracy_score

# Predict on both splits; the gap between the two accuracies shows how much
# the model overfits the training data.
train_pred_1 = logreg.predict(X_train)
test_pred_1 = logreg.predict(X_test)

print("Logistic Regression first try scores")
print(f"Training score: {accuracy_score(y_train, train_pred_1):.3f}")
print(f"Test score: {accuracy_score(y_test, test_pred_1):.3f}")

Logistic Regression first try scores
Training score: 0.930
Test score: 0.625


In [71]:
# Tally how often each class is predicted on the training set. All three
# classes start at zero so a class that is never predicted still shows up.
count_dict = dict.fromkeys(("Neutral", "Positive", "Negative"), 0)
for predicted in train_pred_1:
    count_dict[predicted] = count_dict[predicted] + 1

display(count_dict)

{'Neutral': 4451, 'Positive': 2480, 'Negative': 1042}

In [72]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the same bag-of-words features.
# random_state pins the bootstrap sampling and per-split feature sampling so
# that a Restart & Run All reproduces the same forest and the same scores;
# without it every run yields different results.
rfc = RandomForestClassifier(max_depth=400, n_jobs=-1, random_state=42)
rfc.fit(X_train, y_train)

In [73]:
# Predict with the random forest on both splits and report accuracy.
train_pred_2 = rfc.predict(X_train)
test_pred_2 = rfc.predict(X_test)

print("Random Forest scores")
print(f"Training score: {accuracy_score(y_train, train_pred_2):.3f}")
print(f"Test score: {accuracy_score(y_test, test_pred_2):.3f}")

Random Forest scores
Training score: 0.831
Test score: 0.588


In [75]:
# Inspect how deep the fitted trees actually grew: the deepest tree and the
# mean depth over all trees in the forest.
tree_depths = [estimator.tree_.max_depth for estimator in rfc.estimators_]
max_depth = max(tree_depths, default=0)
sum_depth = sum(tree_depths)

print(f"Maximum depth is: {max_depth}")
print(f"Average depth is: {(sum_depth/len(rfc.estimators_))}")

Maximum depth is: 200
Average depth is: 200.0


In [76]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the raw term counts; alpha is the additive
# smoothing strength.
mnb = MultinomialNB(
    alpha=1,
)
mnb.fit(X=X_train, y=y_train)

In [77]:
# Predict with Naive Bayes on both splits and report accuracy.
train_pred_3 = mnb.predict(X_train)
test_pred_3 = mnb.predict(X_test)

print("Multinomial Naive Bayes scores")
print(f"Training score: {accuracy_score(y_train, train_pred_3):.3f}")
print(f"Test score: {accuracy_score(y_test, test_pred_3):.3f}")

Multinomial Naive Bayes scores
Training score: 0.862
Test score: 0.607
