In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [ "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"]

In [2]:
v = TfidfVectorizer()
transformed_output = v.fit_transform(corpus)
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [3]:
all_feature_names = v.get_feature_names_out()
for word in all_feature_names:
    index = v.vocabulary_.get(word)
    print(f"{word} | {v.idf_[index]}")

already | 2.386294361119891
am | 2.386294361119891
amazon | 2.386294361119891
and | 2.386294361119891
announcing | 1.2876820724517808
apple | 2.386294361119891
are | 2.386294361119891
ate | 2.386294361119891
biryani | 2.386294361119891
dot | 2.386294361119891
eating | 1.9808292530117262
eco | 2.386294361119891
google | 2.386294361119891
grapes | 2.386294361119891
iphone | 2.386294361119891
ironman | 2.386294361119891
is | 1.1335313926245225
loki | 2.386294361119891
microsoft | 2.386294361119891
model | 2.386294361119891
new | 1.2876820724517808
pixel | 2.386294361119891
pizza | 2.386294361119891
surface | 2.386294361119891
tesla | 2.386294361119891
thor | 2.386294361119891
tomorrow | 1.2876820724517808
you | 2.386294361119891


In [4]:
import pandas as pd
import spacy
reviews_df = pd.read_csv(r"C:\Users\nikit\Documents\Graduation internship\NLP\Datasets\dutch_book_reviews.csv")
reviews_df.head()

Unnamed: 0,review,sentiment,IsPositive
0,'Can love ever be stronger than fate?' dat sta...,positive,1
1,Er wordt in het verhaal gebruik gemaakt van gl...,positive,1
2,Verder vond ik het een origineel verhaal met e...,positive,1
3,Ik ben gaan meeleven met Ella en vind het erg ...,positive,1
4,Lees ook het interview met N.J. Simmons,positive,1


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reviews_df.review, reviews_df.IsPositive, test_size= 0.2)

In [6]:
print("Shape of X_train: ",X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (82516,)
Shape of X_test:  (20630,)


In [7]:
v = TfidfVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<82516x103727 sparse matrix of type '<class 'numpy.float64'>'
	with 3263358 stored elements in Compressed Sparse Row format>

In [8]:
v.vocabulary_

{'schrijfster': 77056,
 'linda': 51015,
 'conrad': 17640,
 'leeft': 49336,
 'al': 4281,
 '11': 68,
 'jaar': 42296,
 'lang': 48873,
 'afgezonderd': 3494,
 'in': 40470,
 'haar': 35499,
 'villa': 95075,
 'nadat': 58122,
 'ze': 101574,
 'getraumatiseerd': 32597,
 'is': 42090,
 'geraakt': 31798,
 'doordat': 21864,
 'zus': 103140,
 'anna': 5468,
 'vermoord': 93249,
 'heeft': 36696,
 'gevonden': 32875,
 'de': 19386,
 'dader': 18853,
 'op': 63775,
 'vlucht': 95629,
 'als': 4809,
 'een': 24027,
 'schim': 76522,
 'gezien': 33176,
 'na': 57946,
 'meent': 54576,
 'hem': 37077,
 'te': 84936,
 'herkennen': 37353,
 'tv': 89023,
 'en': 25053,
 'besluit': 10052,
 'val': 90905,
 'lokken': 51481,
 'om': 60832,
 'zo': 102608,
 'bekentenis': 8880,
 'van': 91017,
 'af': 3193,
 'dwingen': 23676,
 'oei': 60497,
 'was': 98213,
 'het': 37627,
 'boek': 12442,
 'dan': 19073,
 'slecht': 79193,
 'nee': 58812,
 'maar': 52426,
 'wel': 98999,
 'ik': 40231,
 'moet': 56695,
 'beoordelen': 9523,
 'thriller': 86305,
 'dit

In [9]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [10]:
X_test_cv = v.transform(X_test)

In [11]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.48      0.61      8966
           1       0.70      0.93      0.80     11664

    accuracy                           0.73     20630
   macro avg       0.77      0.70      0.70     20630
weighted avg       0.76      0.73      0.72     20630



In [12]:
import spacy
nlp = spacy.load("nl_core_news_sm")
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [13]:
reviews_df['preprocessed_txt'] = reviews_df.review.apply(preprocess)
reviews_df.head()

Unnamed: 0,review,sentiment,IsPositive,preprocessed_txt
0,'Can love ever be stronger than fate?' dat sta...,positive,1,Can love ever be stronger than fate staan cove...
1,Er wordt in het verhaal gebruik gemaakt van gl...,positive,1,verhaal gebruik maken glimpen verleden hoofdst...
2,Verder vond ik het een origineel verhaal met e...,positive,1,vinden origineel verhaal interessant kijk leve...
3,Ik ben gaan meeleven met Ella en vind het erg ...,positive,1,gaan meeleav Ella vinden erg 2018 wachten wach...
4,Lees ook het interview met N.J. Simmons,positive,1,Lees interview N.J. Simmons


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reviews_df.preprocessed_txt, reviews_df.IsPositive, test_size= 0.2)

In [15]:
v = TfidfVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<82516x99816 sparse matrix of type '<class 'numpy.float64'>'
	with 1767203 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [17]:
X_test_cv = v.transform(X_test)

In [18]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.48      0.61      8984
           1       0.70      0.93      0.80     11646

    accuracy                           0.73     20630
   macro avg       0.77      0.70      0.70     20630
weighted avg       0.76      0.73      0.72     20630



In [19]:
df_vogel = pd.read_csv(r"C:\Users\nikit\Documents\Graduation internship\NLP\Datasets\vogel.csv")
df_vogel = df_vogel.drop(
    ['web-scraper-order', 'web-scraper-start-url', 'products', 'products-href', 'rating ', 'next', 'next-href',
     'review1'], axis=1)
df_vogel = df_vogel.dropna()
df_vogel['preprocessed_txt'] = df_vogel.review.apply(preprocess)
df_vogel.head()

Unnamed: 0,review,preprocessed_txt
1,"Prima artikel, echter systeem niet feilloos. W...",Prima artikel systeem feilloos wallplaar volle...
2,Het systeem is makkelijk op te hangen. Alle be...,systeem makkelijk afhangen benodighed zitten e...
3,"Perfecte kwaliteit, simpel met het monteren va...",perfect kwaliteit simpel monteren beugel tv mu...
4,"Ik heb dit vorige week gekocht, met het gedach...",vorig week kopen gedachte 65 inch tv afhangen ...
5,Een wall mount die supergemakkelijk te install...,wall mount supergemakkelijk installeren max 10...


In [36]:
rev_count = v.transform(df_vogel.preprocessed_txt)
model.predict(rev_count)

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1], dtype=int64)

In [32]:
a = model.predict(rev_count)
a = pd.DataFrame(a)
print(a.value_counts())

1    38
0     7
dtype: int64


In [43]:
df_vogel.review.iloc[43]

'Ik heb twee maanden geleden gekocht. Omdat de beugel vrij prijzig toch even goedkopere proberen te zoeken, maar vaak middelmatige reviews. Dus toch maar weer voor Vogel gekozen en heb er absoluut geen spijt van. Alles klopt. Je krijgt een lading bouten voor in je tv, dus het kan eigenlijk niet misgaan. Neem geen genoegen met een B merk.'

In [44]:
df_vogel.preprocessed_txt.iloc[43]

'twee maand kopen beugel prijzig goedkoop proberen zoeken middelmatig review Vogel kiezen absoluut spijt kloppen krijgen lading bouten tv eigenlijk misgaan nemen genoegen B Merk'