## TFIDF Vectorisation

In [None]:
import pandas as pd
# Load the dataset; the columns "text" and "Sentiment" are read below.
df = pd.read_csv("data/TheSocialDilemma.csv")
# Collapse the sentiment into two classes: "Positive" -> "positive",
# every other value -> "notpositive".
df["label"] = df["Sentiment"].apply(
    lambda sentiment: "positive" if sentiment == "Positive" else "notpositive"
)
df = df[["text", "label"]]
# Keep a random subset of at most 100 rows. The fixed seed makes the subset
# (and every result derived from it) identical on each Restart & Run All, and
# capping at len(df) avoids a ValueError when the file has fewer than 100 rows.
df = df.sample(n=min(100, len(df)), random_state=42)
df.head()

### Clean-Up to Reduce Vector Dimensionality

In [None]:
import spacy
# Load the spaCy pipeline "en_core_web_lg" once at module level; clean() below
# calls it for every text it processes.
nlp = spacy.load("en_core_web_lg")

def clean(text):
    """Reduce ``text`` to the lemmas of its non-stop-word nouns and verbs.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. Tokens that are stop words, or whose part-of-speech tag is
    neither "NOUN" nor "VERB", are dropped; the lemmas of the remaining
    tokens are returned joined by single spaces.
    """
    wanted_pos = ("NOUN", "VERB")
    lemmas = [
        str(tok.lemma_)
        for tok in nlp(text.lower())
        if not tok.is_stop and tok.pos_ in wanted_pos
    ]
    return " ".join(lemmas)

# Store the cleaned version of every text next to the original column.
df["text_clean"] = df["text"].apply(clean)
df.head()


### TF-IDF Vectors & Features

Observe how the number of features differs when vectorising the texts in column "text" versus those in column "text_clean".

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned texts; fit() returns
# the fitted vectoriser itself, so it can be assigned directly.
vectoriser = TfidfVectorizer().fit(df["text_clean"])
# Print one learned feature (term) per line.
feature_names = vectoriser.get_feature_names_out()
for term in feature_names:
    print(term)