In [1]:
import json

# Load every dataset split from its JSON file into `data`, keyed by split name.
data = {}

for name in ["train", "dev", "test"]:
    # JSON is UTF-8 by specification; pin the encoding so non-ASCII characters
    # (e.g. æ, ø, å) decode correctly even where the platform default differs.
    # The path is relative to the kernel's working directory; a missing
    # Data/3class folder surfaces as FileNotFoundError.
    with open(f"Data/3class/{name}.json", "r", encoding="utf-8") as filedata:
        data[name] = json.load(filedata)

FileNotFoundError: [Errno 2] No such file or directory: 'Data/3class/train.json'

In [None]:
# Display the first three records of the training split
data["train"][0:3]

In [None]:
# this cell has 2 functions, one that checks language and one that translates if English
from langdetect import detect
from googletrans import Translator

translator = Translator()

# detects the language of the text and translates it to Norwegian if it is English
def recognizeText(text):
    """Return `text`, translated to Norwegian only when it is detected as English.

    Args:
        text: The raw text to inspect.

    Returns:
        The result of translateText(text) when langdetect reports 'en';
        otherwise `text` unchanged.
    """
    # Local import keeps this function self-contained; the module belongs to
    # the langdetect package this notebook already depends on.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        language = detect(text)
    except LangDetectException:
        # langdetect raises when the text offers no usable features (empty
        # string, only digits/punctuation); there is nothing to translate then.
        return text

    # translateText() always sends src='en', so only text detected as English
    # is translated. Any other detected language is returned untouched rather
    # than being passed to the translator mislabelled as English.
    if language == 'en':
        return translateText(text)
    return text

# translates English text to Norwegian
def translateText(text):
    """Translate `text` from English ('en') to Norwegian ('no').

    Delegates to the module-level `translator` and returns the `.text`
    attribute of the object it produces.
    """
    return translator.translate(text, src='en', dest='no').text


In [None]:
# this cell has 2 functions, one that tokenizes the words for each sentence and one that lemmatizes the words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this cell relies on before first use: on a fresh
# environment stopwords.words() raises LookupError when the corpus is absent.
# 'punkt' provides the tokenizer models used by word_tokenize().
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

# Norwegian stop words, stored as a set for fast membership tests
stop_words = set(stopwords.words('norwegian'))

# function that 'cleans' the text: lowercases, tokenizes and removes stop words and punctuation
def clean_text(text):
    """Lowercase `text`, tokenize it and drop stop words and punctuation tokens.

    Args:
        text: The text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Use the Norwegian tokenizer model so it matches the Norwegian stop-word list.
    tokens = word_tokenize(text.lower(), language='norwegian')
    kept = [
        token
        for token in tokens
        if token not in stop_words
        # `punctuation` is a string, so `token in punctuation` is only a
        # substring test; checking every character also removes
        # multi-character punctuation tokens such as '...' or '``'.
        and not all(char in punctuation for char in token)
    ]
    return ' '.join(kept)


# WordNet data is required by WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Clean `text` with clean_text() and lemmatize each remaining word.

    WordNetLemmatizer.lemmatize() looks up a single word; handed a whole
    sentence it finds no entry and returns the input unchanged. The cleaned
    text is therefore split on whitespace and lemmatized token by token.

    Args:
        text: The text to clean and lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # NOTE(review): WordNet is an English lexicon, so Norwegian words will
    # mostly pass through unchanged — confirm whether a Norwegian
    # stemmer/lemmatizer is wanted here instead.
    cleaned = clean_text(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in cleaned.split())

In [None]:
def _preprocess(text):
    """Translate English text to Norwegian, then clean and lemmatize it."""
    return lemmatize(recognizeText(text))

# Both splits must go through the identical pipeline: the vectorizer learns its
# vocabulary from the cleaned training text, so uncleaned test text would be
# featurized inconsistently with what the models were trained on.
text_train = [_preprocess(row["text"]) for row in data["train"]]
text_test = [_preprocess(row["text"]) for row in data["test"]]

In [None]:
import numpy as np

# Pull the "label" field of every row into one list per split
train_label_list = [row["label"] for row in data["train"]]
test_label_list = [row["label"] for row in data["test"]]

# Label vectors as NumPy arrays
y_train = np.array(train_label_list)
y_test = np.array(test_label_list)

print(f"Training label shape: {y_train.shape}")
print(f"Testing label shape: {y_test.shape}")

In [None]:
import nltk
from nltk.corpus import stopwords

# NLTK's Norwegian stop-word list, kept as a list
s_words = stopwords.words("norwegian")
print(f"Number of Norwegian stopswords: {len(s_words)}")
print(f"LIST OF STOPWORDS: {s_words}")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: stop words are dropped and min_df=2 ignores terms
# that occur in fewer than two training documents.
vec = CountVectorizer(stop_words=s_words, min_df=2)

# Learn the vocabulary from the training text and encode it in one step;
# the test text is encoded with that same fitted vocabulary.
X_train = vec.fit_transform(text_train).toarray()
X_test = vec.transform(text_test).toarray()

print(f"Vocabulary size: {len(vec.vocabulary_)}")
print(f"Every 100th word in vocabulary: {vec.get_feature_names_out()[::100]}")

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline; n_jobs=-1 requests all available cores
logreg = LogisticRegression(
    max_iter=75,
    n_jobs=-1,
)
logreg.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Predict on both splits with the fitted logistic regression model
train_pred_1 = logreg.predict(X_train)
test_pred_1 = logreg.predict(X_test)

# Report accuracy on each split, rounded to three decimals
print("Logistic Regression first try scores")
print(f"Training score: {accuracy_score(y_train, train_pred_1):.3f}")
print(f"Test score: {accuracy_score(y_test, test_pred_1):.3f}")

In [None]:
# Tally how many training predictions fall under each class label
count_dict = dict.fromkeys(("Neutral", "Positive", "Negative"), 0)
for predicted_label in train_pred_1:
    count_dict[predicted_label] = count_dict[predicted_label] + 1

display(count_dict)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier. The seed is fixed so the bootstrap samples and
# feature subsets — and therefore the reported scores and tree depths — are
# reproducible across runs.
rfc = RandomForestClassifier(max_depth=400, n_jobs=-1, random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Predict on both splits with the fitted random forest
train_pred_2 = rfc.predict(X_train)
test_pred_2 = rfc.predict(X_test)

# Report accuracy on each split, rounded to three decimals
print("Random Forest scores")
print(f"Training score: {accuracy_score(y_train, train_pred_2):.3f}")
print(f"Test score: {accuracy_score(y_test, test_pred_2):.3f}")

In [None]:
# Depth reached by every fitted tree in the forest
tree_depths = [estimator.tree_.max_depth for estimator in rfc.estimators_]

# Deepest tree (0 if there are none) and the total used for the average
max_depth = max(tree_depths, default=0)
sum_depth = sum(tree_depths)

print(f"Maximum depth is: {max_depth}")
print(f"Average depth is: {(sum_depth/len(rfc.estimators_))}")

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with smoothing parameter alpha=1
mnb = MultinomialNB(
    alpha=1,
)
mnb.fit(X=X_train, y=y_train)

In [None]:
# Predict on both splits with the fitted naive Bayes model
train_pred_3 = mnb.predict(X_train)
test_pred_3 = mnb.predict(X_test)

# Report accuracy on each split, rounded to three decimals
print("Multinomial Naive Bayes scores")
print(f"Training score: {accuracy_score(y_train, train_pred_3):.3f}")
print(f"Test score: {accuracy_score(y_test, test_pred_3):.3f}")