# Preparation

In [None]:
# Install NLTK into the running kernel; the preprocessing cell uses its tokenizer, POS tagger, stopword list and WordNet lemmatizer.
%pip install nltk

In [None]:
import nltk
# Download the NLTK data packages needed by the preprocessing cell.
# 'popular' is a bundled collection -- presumably it supplies the tokenizer, tagger and WordNet data used later (TODO confirm).
nltk.download('popular')
# Stopword lists, read via stopwords.words('english') in the preprocessing cell.
nltk.download('stopwords')
# NOTE(review): no code in this notebook reads 'tagsets' directly -- confirm it is still needed.
nltk.download('tagsets')

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV files read below are reachable (Colab-only API).
drive.mount('/content/drive')

Mounted at /content/drive


# Load Data

In [4]:
import pandas as pd

# Read data
# TODO: Change to base_data_path = 'INSERT_PATH'
# NOTE(review): the path is relative ('./drive/...'), so it only resolves when the working
# directory is the parent of the Drive mount point -- presumably /content on Colab.
base_data_path = './drive/MyDrive/Studium_Kempten/Masterarbeit/Azure_Notebooks/Abgabe_Data'
# Raw train/test reviews; the previews below show the columns 'Unnamed: 0', 'Text' and 'Sentiment'.
train_df = pd.read_csv(f'{base_data_path}/amazon_ffr_train.csv')
test_df = pd.read_csv(f'{base_data_path}/amazon_ffr_test.csv')

In [5]:
# Preview the raw training reviews (bare last expression -> rich DataFrame display).
train_df

Unnamed: 0,Text,Sentiment
0,I did not like any of the brew over iced tea f...,2
1,My dog loves the nylabone bones and they aren'...,2
2,I bought this set of six cans for about $25--t...,2
3,If you have only tried Indian food in restaura...,0
4,I got the sea salt and vinegar chips from Kett...,2
...,...,...
397912,Kettle Chips Spicy Thai potato chips have the ...,2
397913,Matcha tea is wonderful. The seller was prompt...,2
397914,I gave it a 4 because my cat did not take to t...,2
397915,"When I first bought this product, I figured i...",2


In [6]:
# Preview the raw test reviews (bare last expression -> rich DataFrame display).
test_df

Unnamed: 0,Text,Sentiment
0,I've bought Joyva tahini for 4 years and never...,0
1,Before ordering these I exchanged email with t...,0
2,There's not much product in the bag - just ove...,1
3,This is an excellent sugar substitute. It has...,2
4,I have purchased this before and am about to d...,2
...,...,...
9995,Love this tea! I enjoy it with a little bit o...,2
9996,After PetCo stopped making a similar product I...,2
9997,"Great product, but I ordered in August and shi...",2
9998,Picked the Rainforest blend up at a local big ...,0


# Preprocess Text for Classic ML

In [None]:
import re
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Initialize stopwords
# Stored as a set so the per-token membership test in the stopword filter is a hash lookup.
stop_words = set(stopwords.words('english'))

# For Cleaning Data: Compile regex
# A tag-like span: '<', one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')
# Any single character that is not an ASCII letter (digits, punctuation, whitespace, non-ASCII letters).
PUNCT_NUM_RE = re.compile('[^a-zA-Z]')
# A lone ASCII letter bounded by whitespace or the string edges; the leading whitespace is part of the match.
SINGLE_CHAR_RE = re.compile(r'(?:\s|^)[a-zA-Z](?=\s|$)')
# One or more consecutive whitespace characters.
MULTI_SPACE_RE = re.compile(r'\s+')

# For POS-Tag Filter: define important tags
# Tokens whose tag (as returned by nltk.pos_tag) is not listed here are dropped before lemmatization.
important_tags = [
    'FW', # foreign words
    'JJ', 'JJR', 'JJS', # adjectives
    'NN', 'NNP', 'NNS', 'NNPS', # nouns
    'RB', 'RBR', 'RBS', # adverbs
    'RP', # particles
    'UH', # interjections
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ' # verbs
]

# For Lemmatization: translate an NLTK POS tag into the POS constant the WordNet lemmatizer expects
def map_pos_tag_nltk_to_wordnet(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    The decision is made on the first letter of the tag only: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Note that the particle tag 'RP'
    also starts with 'R' and is therefore mapped to adverb.

    Args:
        nltk_tag: POS tag string such as 'JJ', 'VBD' or 'NNS'.

    Returns:
        One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV. Any tag
        that matches none of the prefixes yields wordnet.NOUN, which is also the
        default POS of WordNetLemmatizer.lemmatize.
    """
    prefix_to_wordnet_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for tag_prefix, wordnet_pos in prefix_to_wordnet_pos:
        if nltk_tag.startswith(tag_prefix):
            return wordnet_pos
    # Fallback for everything else (e.g. 'FW', 'UH')
    return wordnet.NOUN

# For Lemmatization: Initialize Lemmatizer
# Created once at module level and reused by preprocess_text_classic_ml for every token.
lemmatizer = WordNetLemmatizer()

# Text-preprocessing All-in-One
def preprocess_text_classic_ml(sentence):
    """Turn one raw review into a space-separated string of lemmas.

    Pipeline, in order:
      1. lowercase the text
      2. delete HTML tags (TAG_RE)
      3. replace every non-letter character with a space (PUNCT_NUM_RE)
      4. delete isolated single letters (SINGLE_CHAR_RE)
      5. collapse whitespace runs and trim (MULTI_SPACE_RE)
      6. tokenize with nltk.word_tokenize
      7. drop tokens contained in ``stop_words``
      8. POS-tag the remaining tokens and keep only tags in ``important_tags``
      9. lemmatize each kept token with its WordNet POS

    Args:
        sentence: Raw review text; must provide ``.lower()``.

    Returns:
        The lemmas joined by single spaces (an empty string when nothing survives).
    """
    # Steps 1-5: character-level cleaning
    cleaned = TAG_RE.sub('', sentence.lower())
    cleaned = PUNCT_NUM_RE.sub(' ', cleaned)
    cleaned = SINGLE_CHAR_RE.sub('', cleaned)
    cleaned = MULTI_SPACE_RE.sub(' ', cleaned).strip()

    # Steps 6-7: tokenize, then filter stopwords
    content_words = [
        token for token in nltk.word_tokenize(cleaned) if token not in stop_words
    ]

    # Steps 8-9: tag the stopword-free tokens, keep the important ones, lemmatize them
    lemmas = []
    for token, tag in nltk.pos_tag(content_words):
        if tag not in important_tags:
            continue
        lemmas.append(
            lemmatizer.lemmatize(token, pos=map_pos_tag_nltk_to_wordnet(tag))
        )

    return ' '.join(lemmas)

In [None]:
# Apply Text-preprocessing
# Run the cleaning pipeline over every review and overwrite the 'Text' column with the result
train_df['Text'] = train_df['Text'].map(preprocess_text_classic_ml)
test_df['Text'] = test_df['Text'].map(preprocess_text_classic_ml)

In [24]:
# TODO: Remove this, this is only temporary
# NOTE(review): this cell replaces the frames produced by the preprocessing cell above with
# previously exported, already-preprocessed CSVs, and it re-points base_data_path, which the
# later Doc2Vec to_csv export cell also uses.
from sklearn.model_selection import train_test_split
base_data_path = './drive/MyDrive/Studium_Kempten/Masterarbeit/Azure_Notebooks/data_for_export'
train_df = pd.read_csv(f'{base_data_path}/amazon_ffr_classic_ml_train_full_preprocessed.csv')
temp_df = pd.read_csv(f'{base_data_path}/amazon_ffr_classic_ml_test_full_preprocessed.csv')
# Draw two disjoint samples of 10000 rows each from temp_df with a fixed seed:
# the first becomes test_df, the second is kept as generalization_df.
test_df, generalization_df = train_test_split(temp_df, test_size=10000, train_size=10000, random_state=42)
# Renumber test_df from 0; generalization_df keeps the index it got from the split.
test_df.reset_index(drop=True, inplace=True)

In [38]:
# Drop null values after preprocessing
# Rebind to the filtered frames instead of mutating in place: test_df can originate from
# train_test_split, and in-place edits on such a derived frame can raise pandas'
# chained-assignment warning. The row index is intentionally not reset here; later cells
# call reset_index(drop=True) on the labels where positional alignment matters.
train_df = train_df.dropna()
test_df = test_df.dropna()

In [41]:
# Separate the review text (features) from the sentiment label (target) for both splits
X_train, y_train = train_df['Text'], train_df['Sentiment']
X_test, y_test = test_df['Text'], test_df['Sentiment']

In [39]:
# Preview the preprocessed training reviews ('Text' now holds space-separated lemmas).
train_df

Unnamed: 0,Text,Sentiment
0,brew ice tea flavor try love definately recomm...,2
1,dog love nylabone bone messy mind give,2
2,bought set can total idea get good buy see sel...,2
3,tried indian food restaurant something find ed...,0
4,get sea salt vinegar chip kettle used salt vin...,2
...,...,...
397912,kettle chip spicy thai potato chip perfect amo...,2
397913,matcha tea wonderful seller prompt many health...,2
397914,give cat take first almost think buy nothing p...,2
397915,first buy product figure taste anywhere whey p...,2


In [40]:
# Preview the preprocessed test reviews ('Text' now holds space-separated lemmas).
test_df

Unnamed: 0,Text,Sentiment
0,buy joyva tahini year never bad experience buy...,0
1,order exchange email manufacturer hickory harv...,0
2,much product bag handfull pay sale serve bit p...,1
3,excellent sugar substitute aftertaste spoon su...,2
4,purchase fast shipment chip fresh great seller,2
...,...,...
9995,love tea enjoy little bit honey add touch swee...,2
9996,petco stop make similar product delight find c...,2
9997,great product order august shipping take day f...,2
9998,pick rainforest blend local big box wholesale ...,0


# Numerical Representation of Text-Data

## TF-IDF

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 1000 terms, i.e. at most 1000 TF-IDF columns shared by all reviews
# (a corpus-wide limit, not a per-review one)
MAX_FEATURES = 1000
# All words which appear in less than 1% of the reviews are ignored
MIN_DF = 0.01
# All words which appear in more than 99% of the reviews are ignored
MAX_DF = 0.99

# Create the TF-IDF-Vectorizer with limits for the features
vectorizer_tfidf = TfidfVectorizer(max_features=MAX_FEATURES, min_df=MIN_DF, max_df=MAX_DF)

# Use fit_transform on the training data
# (the vocabulary is derived from the training reviews only)
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
# transform the test data
# (reuses the fitted vectorizer, so train and test share the same columns)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF-Vectorizer WITHOUT any limits / parameters
# (library defaults only; kept as a comparison baseline for the limited vectorizer above)
vectorizer_tfidf_no_parameters = TfidfVectorizer()

# Use fit_transform on the training data
X_train_tfidf_no_parameters = vectorizer_tfidf_no_parameters.fit_transform(X_train)
# transform the test data
# (reuses the vectorizer fitted on the training data)
X_test_tfidf_no_parameters = vectorizer_tfidf_no_parameters.transform(X_test)

In [46]:
# Compare the matrix shapes (rows = reviews, columns = vocabulary terms) of both TF-IDF variants
print(f'Shape of train-data WITH limited size (parameters) {X_train_tfidf.shape}')
print(f'Shape of train-data NO limited size (parameters) {X_train_tfidf_no_parameters.shape}')

Shape of train-data WITH limited size (parameters) (397912, 585)
Shape of train-data NO limited size (parameters) (397912, 96885)


In [50]:
# --> Each Review is now represented as one TF-IDF Vector
# Example Vector:
# Take the first training review, turn its sparse row into a dense column labelled with the
# vocabulary terms, and sort so the highest-weighted terms come first.
pd.DataFrame(X_train_tfidf[0].T.todense(), index=vectorizer_tfidf.get_feature_names_out(), columns=["TF-IDF"]).sort_values(by=["TF-IDF"], ascending=False)

Unnamed: 0,TF-IDF
tea,0.577797
ice,0.417843
brew,0.394983
hot,0.345322
recommend,0.288788
...,...
far,0.000000
fast,0.000000
fat,0.000000
favorite,0.000000


## Word2Vec

In [None]:
from gensim.models import Word2Vec
import numpy as np

# The Doc2Vec and Word2Vec-Model need tokenized data.
# the input data is already preprocessed, so using str(x).split() is sufficient for tokenization here
# (str(x) also guards against non-string cells; whitespace split yields one token per lemma)
X_train_tokens = X_train.apply(lambda x: str(x).split())
X_test_tokens = X_test.apply(lambda x: str(x).split())

# window = int - The maximum distance between the current and predicted word within a sentence. E.g. X words on the left and X words on the right of our target
# min_count = int - Ignores all words with total absolute frequency lower than this
# vector_size = int - Dimensionality of the feature vectors.
# workers = int - Use these many worker threads to train the model
# epochs : int - Number of iterations (epochs) over the corpus.

# Initialize Word2Vec Model based on the train data
# NOTE(review): no seed is passed and workers=4, whereas the Doc2Vec cell uses workers=1 and seed=42 --
# confirm whether run-to-run reproducibility is required for the Word2Vec features as well.
w2v_model = Word2Vec(X_train_tokens, window=5, min_count=2, vector_size=100, workers=4, epochs=10)

In [None]:
# Report how many distinct words the Word2Vec model kept and show the first 15 entries of its vocabulary list
print(f'Size of vocabulary: {len(w2v_model.wv.index_to_key)}')
print(f'Top 15 Words: {w2v_model.wv.index_to_key[:15]}')

Size of vocabulary: 52596
Top 15 Words: ['taste', 'good', 'flavor', 'get', 'product', 'love', 'coffee', 'make', 'great', 'use', 'food', 'try', 'tea', 'buy', 'find']


In [None]:
def text_to_word2vec_vector(review, w2v_model):
    """Represent one tokenized review as the mean of its Word2Vec word vectors.

    Args:
        review: Iterable of tokens (e.g. a list of words) belonging to one review.
        w2v_model: Trained model exposing ``w2v_model.wv``, which must support
            ``word in wv``, ``wv[word]`` and ``wv.vector_size``.

    Returns:
        A 1-D numpy array of length ``w2v_model.wv.vector_size``: the element-wise
        mean over the vectors of all in-vocabulary tokens, or an all-zero vector
        when no token of the review is known to the model (including an empty review).
    """
    keyed_vectors = w2v_model.wv
    words_vecs = [keyed_vectors[word] for word in review if word in keyed_vectors]
    # Handle a review full of unknown words.
    # The fallback takes its length from the model instead of a hard-coded 100, so it
    # always has the same dimensionality as the averaged vectors it is stacked with.
    if not words_vecs:
        return np.zeros(keyed_vectors.vector_size)
    # Word2Vec yields one vector FOR EACH WORD;
    # the document (review) vector is their average.
    return np.mean(np.array(words_vecs), axis=0)

In [None]:
# Apply vectorization to train & test data
# One averaged vector per review, stacked into a 2-D array (rows = reviews);
# both splits use the model that was trained on the training tokens only.
X_train_word2vec = np.array([text_to_word2vec_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_word2vec = np.array([text_to_word2vec_vector(tokens, w2v_model) for tokens in X_test_tokens])

## Doc2Vec

In [None]:
from gensim.models.doc2vec import TaggedDocument

# Tagged train data is needed to train the Doc2Vec Model
# Test-Data does not need to be tagged
# Each review gets the tag 'train_<position>' (position from enumerate, not the DataFrame index);
# the same tags are used later to look up the learned document vectors.
tagged_train_data = [TaggedDocument(words=words, tags=[f'train_{i}']) for i, words in enumerate(X_train_tokens)]

In [None]:
# Show the first two tagged documents as a sanity check of words and tags
print(tagged_train_data[:2])

[TaggedDocument(words=['brew', 'ice', 'tea', 'flavor', 'try', 'love', 'definately', 'recommend', 'iced', 'hot', 'tea'], tags=['train_0']), TaggedDocument(words=['dog', 'love', 'nylabone', 'bone', 'messy', 'mind', 'give'], tags=['train_1'])]


In [None]:
from gensim.models.doc2vec import Doc2Vec

# Define Doc2Vec model, build vocabulary, train the model
# workers=1 together with seed=42 -- presumably chosen so training is repeatable (TODO confirm).
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=1, epochs=10, seed=42)
# The vocabulary is built from the tagged training reviews only.
doc2vec_model.build_vocab(tagged_train_data)
# Train on the same corpus, reusing the document count and epoch count stored on the model.
doc2vec_model.train(tagged_train_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [None]:
# Number of distinct words in the Doc2Vec model's word vocabulary
vocab_size = len(doc2vec_model.wv.key_to_index)
print(f"Size of vocabulary: {vocab_size}")

Size of vocabulary: 52596


In [None]:
# Apply vectorization to train & test data
# Training reviews: look up the vectors learned during training via the 'train_<position>' tags
# assigned when the tagged documents were built.
X_train_doc2vec = np.array([doc2vec_model.dv[f'train_{i}'] for i in range(len(X_train_tokens))])
# Test reviews were not part of training, so their vectors are inferred from their tokens.
X_test_doc2vec = np.array([doc2vec_model.infer_vector(doc) for doc in X_test_tokens])

In [None]:
# Converting the transformed Training- and Testdata into DataFrames
# Column names are derived from the actual embedding width rather than a hard-coded 100,
# so this cell keeps working when the Doc2Vec vector_size is changed.
doc2vec_columns = [f"Doc2Vec_{i}" for i in range(X_train_doc2vec.shape[1])]
train_df_transformed = pd.DataFrame(X_train_doc2vec, columns=doc2vec_columns)
test_df_transformed = pd.DataFrame(X_test_doc2vec, columns=doc2vec_columns)

In [None]:
label_column='Sentiment'
# Adding the Labels to the DataFrames
# reset_index(drop=True) renumbers the labels from 0 so they line up row by row with the
# feature frames built from plain arrays (rows were dropped from train_df/test_df earlier,
# which can leave gaps in the label index).
train_df_transformed[label_column] = y_train.reset_index(drop=True)
test_df_transformed[label_column] = y_test.reset_index(drop=True)

In [None]:
# Preview the Doc2Vec feature frame: one 'Doc2Vec_<i>' column per embedding dimension plus the label column
train_df_transformed

Unnamed: 0,Doc2Vec_0,Doc2Vec_1,Doc2Vec_2,Doc2Vec_3,Doc2Vec_4,Doc2Vec_5,Doc2Vec_6,Doc2Vec_7,Doc2Vec_8,Doc2Vec_9,...,Doc2Vec_91,Doc2Vec_92,Doc2Vec_93,Doc2Vec_94,Doc2Vec_95,Doc2Vec_96,Doc2Vec_97,Doc2Vec_98,Doc2Vec_99,Sentiment
0,-0.082163,-0.081199,0.026726,-0.025701,0.048737,-0.002387,-0.045283,0.055585,-0.009186,0.113985,...,-0.012686,0.082674,-0.009653,0.118468,0.092455,0.042676,-0.042164,0.008726,0.026581,2
1,-0.047760,-0.014450,0.009842,-0.031044,0.101228,-0.058513,0.006752,0.009453,-0.067625,-0.006022,...,0.088210,0.008454,-0.024645,0.101673,-0.010143,0.051384,-0.009700,-0.034938,0.042613,2
2,-0.088413,0.052394,-0.006616,-0.071001,0.128266,-0.022012,0.093738,0.090435,0.257044,0.072351,...,0.089578,0.097506,0.095228,0.155990,-0.060308,-0.149187,-0.021136,-0.351205,-0.279734,2
3,-0.026379,-0.068179,0.061104,-0.120204,-0.024576,-0.068373,-0.072497,0.182345,-0.068649,0.220170,...,-0.099159,0.191937,0.073729,0.130841,-0.049849,0.073395,-0.096735,0.046754,0.036784,0
4,-0.468424,-0.281337,0.138786,-0.123415,0.344630,-0.173613,-0.057691,-0.257429,-0.155804,0.152811,...,-0.108153,0.275459,0.077906,0.224223,0.168732,-0.235507,0.173782,-0.029449,0.273709,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397912,-0.199525,-0.064818,0.000078,-0.153181,0.205350,-0.086645,-0.054996,0.177268,-0.071626,0.097895,...,0.061720,0.282481,-0.033211,0.341646,0.237285,0.062051,0.031497,-0.136233,0.183975,2
397913,-0.090363,0.005330,-0.029674,-0.044821,0.103264,0.033271,-0.008328,0.012920,-0.048687,0.034772,...,-0.125066,0.124320,-0.017458,0.154479,0.097180,0.155187,-0.104426,0.051666,0.042357,2
397914,0.000491,-0.089054,0.107708,-0.065953,0.156555,-0.151014,-0.028635,0.083541,0.079402,0.139187,...,0.149233,0.041410,0.112241,0.270263,0.104267,-0.002859,0.017670,-0.118680,-0.130077,2
397915,-0.007528,-0.119786,-0.090412,-0.301174,-0.279661,0.088210,-0.001229,0.171214,0.090999,0.298241,...,0.027106,-0.160280,0.000374,0.039213,0.168857,-0.122471,0.106530,0.397100,0.163284,2


In [None]:
# Save the trained Doc2Vec Model
import os

doc2vec_model_path = './models/doc2vec_model/trained_doc2vec_model'
# Make sure the target directory exists first; on a fresh runtime it does not,
# and saving into a missing directory fails.
os.makedirs(os.path.dirname(doc2vec_model_path), exist_ok=True)
doc2vec_model.save(doc2vec_model_path)

In [None]:
# Export the Doc2Vec features plus labels as CSV, without writing the row index.
# NOTE(review): base_data_path was re-pointed by the temporary reload cell, so these files land
# in that folder rather than in the folder the raw data was first read from -- confirm intended.
train_df_transformed.to_csv(f'{base_data_path}/amazon_ffr_train_doc2vec.csv', index=False)
test_df_transformed.to_csv(f'{base_data_path}/amazon_ffr_test_doc2vec.csv', index=False)

# Training of Classic ML Models

In [None]:
# Create Dictionary in which all trained models will be saved
# Keys are model names, values the fitted estimators; the evaluation cell iterates over it.
models_dict = {}

## SVM / SVC

In [None]:
from sklearn.svm import SVC

# Only the seed is fixed; every other SVC hyperparameter keeps the library default
model_params = {
    "random_state": 11
}

model_svc = SVC(**model_params)
# Train on the Doc2Vec vectors, like every other model in this notebook. The evaluation
# loop predicts on X_test_doc2vec, and SVC needs a numeric feature matrix -- the raw
# text Series X_train cannot be fitted.
# NOTE(review): kernel SVC training time grows steeply with the number of samples;
# consider a subsample or a linear SVM if this cell is too slow on the full training set.
models_dict['SVC'] = model_svc.fit(X_train_doc2vec, y_train)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-regularized logistic regression with C=1.0, the lbfgs solver and a fixed seed
model_params = {
    "penalty": "l2",
    "C": 1.0,
    "solver": "lbfgs",
    "random_state": 11
}

model_logistic_regression=LogisticRegression(**model_params)
# Fit on the Doc2Vec document vectors and register the fitted model for the evaluation loop
models_dict['Logistic_Regression']= model_logistic_regression.fit(X_train_doc2vec, y_train)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 bootstrapped trees with unlimited depth (max_depth=None) and a fixed seed
model_params = {
    "n_estimators": 100,
    "max_depth": None,
    "min_samples_split": 2,
    "bootstrap": True,
    "random_state": 11
}

model_random_forest=RandomForestClassifier(**model_params)
# Fit on the Doc2Vec document vectors and register the fitted model for the evaluation loop
models_dict['Random_Forest']=model_random_forest.fit(X_train_doc2vec, y_train)

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 100 estimators, learning rate 0.1, fixed seed.
# NOTE(review): "use_label_encoder" looks like a legacy option -- confirm the installed XGBoost
# version still accepts it without a warning.
model_params = {
    "learning_rate": 0.1,
    "n_estimators": 100,
    "booster": "gbtree",
    "random_state": 11,
    "use_label_encoder": False
}

model_xgboost=XGBClassifier(**model_params)
# Fit on the Doc2Vec document vectors and register the fitted model for the evaluation loop
models_dict['XGBoost']=model_xgboost.fit(X_train_doc2vec, y_train)

# Evaluation of Classic ML Models

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Evaluate every trained model on the Doc2Vec test vectors.
# The per-model outputs are collected in evaluation_results so they are still available
# after the loop; previously each iteration overwrote y_pred/results/cm and only the
# last model's metrics remained.
evaluation_results = {}
for model_name, model in models_dict.items():
    y_pred = model.predict(X_test_doc2vec)
    # output_dict=True returns the report as a nested dict instead of a formatted string
    results = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)
    evaluation_results[model_name] = {
        'predictions': y_pred,
        'classification_report': results,
        'confusion_matrix': cm,
    }