Build a sentiment classification model for the 50k IMDB movie reviews dataset using only simple statistical techniques.

Evaluate its performance and identify which features work and which don't.

Reference notebooks to learn from:
- https://www.kaggle.com/code/majinx/nlp-imdb-reviews-prediction-multiple-models
- https://www.kaggle.com/code/yasserh/imdb-movie-rating-sentiment-analysis
- https://www.kaggle.com/code/dmid2qwde/imdb-best-accuracy-0-89

In [1]:
import random, time
import re, string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk import pos_tag as nltk_pos_tagger
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize as nltk_tokenizer
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from tqdm.notebook import tqdm
from wordcloud import WordCloud

In [2]:
# Load the complete review table from the local data directory.
full_data_pdf = pd.read_csv(
    '../../local/data/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
)

# Hold out 20% of the rows as the test set; it must stay unseen while developing.
train_pdf = full_data_pdf.sample(frac=0.8, random_state=0)
test_pdf = full_data_pdf[~full_data_pdf.index.isin(train_pdf.index)]

Data Exploration

In [None]:
"""
Observations:
- there are some html tags. Mostly <br /> tags
- sometimes the review has rating inside it like 7/10, 10/10, RATING: 10 of 10. That can be useful signal
"""

# print the frame summary for the training split
train_pdf.info()

# see sample reviews
# max_colwidth=None turns off column truncation so each sampled review is shown in full
with pd.option_context('display.max_colwidth', None):
    display(train_pdf.sample(10))

In [None]:
# Try to find simple discriminatory features.
# Plot the word-count and character-count distributions by class: maybe one class has longer reviews.
train_pdf['num_words'] = train_pdf['review'].apply(lambda x: len(x.split()))
train_pdf['num_chars'] = train_pdf['review'].apply(lambda x: len(x))
sns.kdeplot(data=train_pdf, x='num_words', hue='sentiment', common_norm=False)
plt.show()
sns.kdeplot(data=train_pdf, x='num_chars', hue='sentiment', common_norm=False)
plt.show()
# both classes have similar distribution. So, length of review is not a good discriminator

# Regex that matches html tags like <.*>. Which tags are present? Do they help discriminate?
html_tag_re = re.compile(r'<.*?>')
html_tags = Counter()
for review in train_pdf['review']:
    html_tags.update(html_tag_re.findall(review))
print(html_tags)
# only <br /> tag has significant counts. Others can be ignored or removed. In any case, they have very low counts
# does <br /> only appear in pairs? Like <br /><br />

br_tag_pairs_counts = sum(review.count('<br /><br />') for review in train_pdf['review'])
# a result close to 0 means almost every <br /> belongs to a pair
print (2 * br_tag_pairs_counts - html_tags['<br />']) # yes. pretty much

# does the presence of <br /> tag help discriminate?
train_pdf['num_br_tags'] = train_pdf['review'].apply(lambda x: x.count('<br />'))
sns.countplot(data=train_pdf, x='num_br_tags', hue='sentiment')
plt.show() # not at all. Both classes have similar distribution

# Sometimes reviews contain website addresses. Show their counts.
# Raw string: '\S' and '\.' are regex escapes, not valid Python string escapes.
website_re = re.compile(r'https?://\S+|www\.\S+')
websites = Counter()
for review in train_pdf['review']:
    websites.update(website_re.findall(review))
print(websites)
print(sum(websites.values())) # very low counts. Can be ignored

# Maybe the amount of capitalization can help discriminate.
train_pdf['num_upper'] = train_pdf['review'].apply(lambda x: sum(1 for c in x if c.isupper()))
sns.kdeplot(data=train_pdf, x='num_upper', hue='sentiment', common_norm=False)
plt.show() # negative reviews have slightly more capitalization, but not much

# try to extract ratings from reviews like 7/10, 2.5/4, RATING: 10 of 10, 5 out of 5
def extract_ratings(review):
    """Return every rating-like expression found in ``review``.

    Recognised forms (case-insensitive) include ``7/10``, ``2.5/4``,
    ``5 out of 5`` and ``RATING: 10 of 10``. The word "out" is optional, and a
    rating may sit at the very end of the text or be followed by any character
    other than a digit or a slash.

    Args:
        review: The review text to scan.

    Returns:
        A list of the matched substrings, in order of appearance. The character
        following a rating (space, full stop, ...) is not part of the match.
    """
    rating_re = re.compile(
        r'(RATING:?\s*)?\b(-?\d+(\.\d+)?)\s*(/|(out\s*)?of)\s*\d+(\.\d+)?(?![\d/])',
        re.IGNORECASE)
    # finditer scans the text once; the matches never overlap, and no new
    # "ratings" can appear from text being glued together after a removal.
    return [match.group(0) for match in rating_re.finditer(review)]

# Extract the ratings mentioned in each review and count every distinct rating string.
train_pdf['ratings'] = train_pdf['review'].apply(extract_ratings)
rating_counts = Counter()
for ratings in train_pdf['ratings']:
    rating_counts.update(ratings)
print (f"Total ratings: {sum(rating_counts.values())}\n\n")
print(rating_counts)

# plot ratings distribution by class
train_pdf['num_ratings'] = train_pdf['ratings'].apply(len)
# show bar plot of num ratings distribution by class
sns.countplot(data=train_pdf, x='num_ratings', hue='sentiment')
plt.show() # weird! Why is even this balanced between the two classes :D? Was the data synthetically generated?

# Show one word cloud of the extracted ratings per class.
for sentiment_label in ('positive', 'negative'):
    class_ratings = train_pdf.loc[train_pdf['sentiment'] == sentiment_label, 'ratings']
    # join each rating into a single underscore-separated word before building the cloud
    rating_words = ' '.join(
        rating.replace(' ', '_').replace(':', '')
        for ratings in class_ratings
        for rating in ratings
    )
    wordcloud = WordCloud(width=800, height=400).generate(rating_words)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Ratings mentioned in {sentiment_label} reviews')
    plt.show()

# alright! ratings are present in some reviews. They are useful discriminators but the count is pretty low


Develop feature computation pipeline

In [None]:
"""
Pipeline structure:

- text preprocessing
    - remove html tags
    - remove websites
    - tunable
        - extract ratings and inject as new synthetic normalized words inside the review
        - try negation handling like NOT_good, NOT_boring, etc. as described in the NLP book
        - tag with POS tags
        - lowercasing
        - remove punctuations and special characters
        - remove stopwords
        - lemmatization
- feature computation
    - tokenizer: r"(?u)\b\w\w+\b" (default)
    - count vectorizer
        - tune ngram range: (1, 1), (1, 2), (1, 3)
        - set binary to true or false
        - set min_df to values like 1, 2, 5
        - set max_features to values like 1000, 5000, 10000
    - tfidf transformer
        - skip completely
        - try different norms: l1, l2, none
        - set use_idf to true or false
        - set smooth_idf to true or false
        - set sublinear tf to true or false
    - lexicon features
        - raw counts of ratings in Afinn, Bing, NRC lexicons
            - unique words in the lexicon that are present in the review
        - normalize
            - divide by the number of words in the review
- model training
    - LogisticRegression
        - tweak hyperparameters
        - know if it's underfitting or overfitting
    - MultinomialNB
        - ensure each feature is a count
        - tweak alpha (smoothing parameter)
    - BernoulliNB
        - ensure each feature is binary
        - tweak alpha (smoothing parameter)
    - RandomForest
        - tweak hyperparameters
    - GradientBoosting
        - tweak hyperparameters
- model evaluation
    - metrics:
        - accuracy, precision, recall, F1-score
        - confusion matrix
        - ROC-AUC
    - compare metrics between training and dev set to detect overfitting
    - show feature importance
- model selection
    - tweak each model type separately
        - try different preprocessing and hyperparameters that work best for that model
"""
# dummy assignment — presumably here so the notes string above is not echoed as the cell's output
_=1

In [3]:
# Any HTML tag (non-greedy), e.g. <br />.
html_tag_re = re.compile(r'<.*?>')
# http(s):// URLs and bare www. addresses, up to the next whitespace.
# Raw string: '\S' and '\.' are regex escapes, not valid Python string escapes.
website_re = re.compile(r'https?://\S+|www\.\S+')
# Ratings such as 7/10, 2.5/4, "5 out of 5" or "RATING: 10 of 10".
# The trailing negative lookahead rejects a following digit or '/' WITHOUT
# consuming a character, so a rating at the very end of the text still matches
# and the punctuation after a rating stays in the review.
rating_re = re.compile(
    r'(RATING:?\s*)?\b(-?\d+(\.\d+)?)\s*(/|((out)?\s*(of)))\s*\d+(\.\d+)?(?![\d/])',
    re.IGNORECASE)
# NLTK helpers built once here and reused by preprocess_text.
nltk_porter_stemmer = PorterStemmer()
# a set, so stop-word membership checks are constant-time
nltk_stop_word_list = {*stopwords.words('english')}

def extract_ratings(review):
    """Cut every rating expression out of ``review``.

    The text is searched with the module-level ``rating_re``; each hit is
    recorded and removed, and the search restarts on the shortened text until
    nothing matches any more.

    Returns:
        A ``(ratings, review)`` tuple: the matched substrings in the order they
        were found, and the review text with those substrings removed.
    """
    ratings = []
    while True:
        match = rating_re.search(review)
        if match is None:
            break
        begin, end = match.span()
        ratings.append(review[begin:end])
        review = review[:begin] + review[end:]
    return ratings, review

def handle_negation(text):
    """Prefix every token inside a negation scope with ``NOT_``.

    A scope opens after a negation token ("n't", "not", "no", "never"; compared
    on the lower-cased part of the token before its first underscore) and closes
    at the next punctuation token ('.', ',', ';', ':', '!', '?'). The negation
    token that opens a scope is itself left unmarked unless it already sits
    inside an open scope.

    Args:
        text: Text to tokenize with ``nltk_tokenizer``.

    Returns:
        The tokens, marked where applicable, joined by single spaces.
    """
    negation_tokens = ("n't", "not", "no", "never")
    punctuation_marks = ('.', ',', ';', ':', '!', '?')

    marked_tokens = []
    in_scope = False
    for tok in nltk_tokenizer(text):
        # punctuation closes the scope before the token itself is emitted
        if tok in punctuation_marks:
            in_scope = False

        marked_tokens.append(f"NOT_{tok}" if in_scope else tok)

        # a negation word opens (or keeps open) the scope for the following tokens
        head = tok.lower().split('_')[0]
        in_scope = in_scope or head in negation_tokens

    return ' '.join(marked_tokens)

In [4]:
# Load the three sentiment lexicons, tagging each row with the lexicon it came from.
afinn = pd.read_csv(
    '../../local/data/lexicons/sentiment_analysis/Afinn.csv', encoding='ISO-8859-1'
).assign(source='Afinn')
bing = (
    pd.read_csv('../../local/data/lexicons/sentiment_analysis/Bing.csv')
    .rename(columns={'sentiment': 'value'})
    .assign(source='Bing')
)
nrc = (
    pd.read_csv('../../local/data/lexicons/sentiment_analysis/NRC.csv')
    .rename(columns={'sentiment': 'value'})
    .assign(source='NRC')
)

# One row per (word, label), where label is "<lexicon>_<value>".
sentiment_lexicons = (
    pd.concat([afinn, bing, nrc], ignore_index=True)
    .assign(word=lambda lex: lex['word'].str.lower())
    .assign(label=lambda lex: lex['source'] + '_' + lex['value'].astype(str))
    [['word', 'label']]
    .drop_duplicates()
)

# label -> set of lower-cased words carrying that label
label_to_words = {
    label: set(words)
    for label, words in sentiment_lexicons.groupby('label')['word']
}

def compute_lexicon_feature(text, label, normalize=False, lexicon=None):
    """Count the distinct words of ``text`` that carry a given lexicon label.

    The text is lower-cased and split on whitespace; each distinct word is
    counted at most once.

    Args:
        text: The review text.
        label: Key into the lexicon mapping.
        normalize: If True, divide the count by the number of distinct words in
            ``text``.
        lexicon: Optional mapping ``label -> collection of words``. Defaults to
            the module-level ``label_to_words``.

    Returns:
        The number of distinct matching words, or that number divided by the
        number of distinct words when ``normalize`` is True. A text without any
        words yields 0.0 when normalizing instead of raising ZeroDivisionError.
    """
    if lexicon is None:
        lexicon = label_to_words
    text_words = set(text.lower().split())
    common_words = len(text_words.intersection(lexicon[label]))
    if not normalize:
        return common_words
    if not text_words:
        # empty or whitespace-only text: nothing to normalize by
        return 0.0
    return common_words / len(text_words)

In [5]:
def preprocess_text(text, config):
    """Clean one raw review and apply the optional steps enabled in ``config``.

    HTML tags and website addresses are always removed. The remaining steps are
    switched on by boolean flags under ``config['text_preprocessing']`` (all
    default to False) and run in this order: ``extract_ratings``,
    ``pos_tagging``, ``negation_handling``, ``lowercasing``,
    ``remove_puncs_n_sp_ch``, then ``remove_stopwords`` / ``stemming``.

    Tokens may carry decorations added by earlier steps: a ``NOT_`` / ``not_``
    negation prefix and a ``_<tag>`` part-of-speech suffix. Stop-word removal
    and stemming act on the bare word and put the decorations back afterwards.

    Args:
        text: Raw review text.
        config: Pipeline configuration dict.

    Returns:
        The processed text. When ratings were extracted, each one is appended as
        a synthetic ``RATING_START_<rating>_END`` word.
    """
    options = config.get('text_preprocessing', {})

    # remove html tags; a paragraph break becomes a space so the words around it do not fuse
    text = text.replace('<br /><br />', ' ')
    text = html_tag_re.sub('', text)

    # remove websites
    text = website_re.sub('', text)

    # extract ratings (they are re-injected as synthetic words at the end)
    ratings = []
    extract_ratings_flag = options.get('extract_ratings', False)
    if extract_ratings_flag:
        ratings, text = extract_ratings(text)

    # add POS tags as a "_<tag>" suffix (skipped when the tag equals the token, e.g. ".")
    if options.get('pos_tagging', False):
        tagged_tokens = nltk_pos_tagger(nltk_tokenizer(text))
        text = ' '.join(token + ("_" + tag if tag != token else '') for token, tag in tagged_tokens)

    # handle negation
    if options.get('negation_handling', False):
        text = handle_negation(text)

    # lower casing
    if options.get('lowercasing', False):
        text = text.lower()

    # remove punctuation and special characters; '_' is kept because it joins the decorations
    if options.get('remove_puncs_n_sp_ch', False):
        punctuation_without_underscore = string.punctuation.replace('_', '')
        text = text.translate(str.maketrans('', '', punctuation_without_underscore))

    # remove stopwords and/or stem
    remove_stopwords_flag = options.get('remove_stopwords', False)
    stemming_flag = options.get('stemming', False)

    if remove_stopwords_flag or stemming_flag:
        prefix_len = len('not_')
        new_tokens = []
        for token in nltk_tokenizer(text):
            # Strip the negation marker only when it is a PREFIX. Splitting on
            # 'not_' anywhere in the token would turn e.g. "knot_nn" into "nn".
            # The comparison is case-insensitive so "NOT_" is also recognised
            # when lowercasing is disabled.
            negation_prefix = ''
            if token[:prefix_len].lower() == 'not_':
                negation_prefix = token[:prefix_len]
                token = token[prefix_len:]
            pos_decoration = ''
            if '_' in token:
                pos_decoration = token.split('_')[-1]
                token = token.split('_')[0]
            if not token:
                # nothing left of the word (e.g. a bracket whose punctuation was removed)
                continue
            if remove_stopwords_flag and token in nltk_stop_word_list:
                continue
            if stemming_flag:
                token = nltk_porter_stemmer.stem(token)
            new_tokens.append(f"{negation_prefix}{token}{'_' + pos_decoration if pos_decoration else ''}")

        text = ' '.join(new_tokens)

    if extract_ratings_flag:
        # inject ratings as synthetic words
        for rating in ratings:
            rating = rating.strip().strip('.')
            # replace every run of whitespace (spaces, tabs, newlines) with one
            # underscore so the rating stays a single whitespace-delimited token
            rating = re.sub(r'\s+', '_', rating)
            text += f" RATING_START_{rating}_END"

    return text

In [6]:
def fit_features(train_pdf, pipeline_config):
    """Fit everything ``compute_features`` needs on the training split.

    Args:
        train_pdf: Training frame with a ``processed_review`` column and, when
            lexicon features are used, an ``unprocessed_review`` column. The
            caller's frame is not modified (a copy is used).
        pipeline_config: Pipeline configuration; the
            ``feature_computation`` section is read.

    Returns:
        A dict with ``"<name>_range"`` -> ``(min, max)`` for ``word_count``,
        ``char_count`` and every lexicon label, plus the fitted
        ``count_vectorizer`` and ``tfidf_transformer`` when a count vectorizer
        is configured.

    Raises:
        ValueError: If the configured tokenizer name is unknown.
    """
    train_pdf = train_pdf.copy()
    trained_assets = {}
    train_pdf['word_count'] = train_pdf['processed_review'].apply(lambda x: len(x.split()))
    train_pdf['char_count'] = train_pdf['processed_review'].apply(len)

    # The lexicon ranges must describe the quantity that compute_features
    # actually scales: raw (un-normalized) counts on the unprocessed review.
    # Fall back to the processed text only when the raw column is absent.
    lexicon_source = ('unprocessed_review' if 'unprocessed_review' in train_pdf.columns
                      else 'processed_review')
    for label in label_to_words.keys():
        train_pdf[label] = train_pdf[lexicon_source]\
            .apply(lambda x: compute_lexicon_feature(x, label, normalize=False))
    for cname in ['word_count', 'char_count'] + list(label_to_words.keys()):
        trained_assets[cname+"_range"] = (train_pdf[cname].min(), train_pdf[cname].max())

    if 'count_vectorizer' in pipeline_config['feature_computation']:
        cv_config = pipeline_config['feature_computation']['count_vectorizer']
        tokenizer = pipeline_config['feature_computation'].get('tokenizer', None)
        if tokenizer is None:
            pass  # use CountVectorizer's default token pattern
        elif tokenizer == 'nltk':
            tokenizer = nltk_tokenizer
        elif tokenizer == 'split':
            tokenizer = str.split  # plain whitespace split
        else:
            raise ValueError(f"Unknown tokenizer: {tokenizer}")
        count_vectorizer = CountVectorizer(tokenizer=tokenizer,
                                           lowercase=False,
                                           ngram_range=cv_config.get('ngram_range', (1, 1)),
                                           binary=cv_config.get('binary', False),
                                           max_df=cv_config.get('max_df', 1.0),
                                           min_df=cv_config.get('min_df', 1),
                                           max_features=cv_config.get('max_features', None))
        # fit_transform tokenizes the corpus once instead of twice (fit, then transform)
        train_counts = count_vectorizer.fit_transform(train_pdf['processed_review'])
        tfidf_config = cv_config.get('tfidf_transformer', {})
        tfidf_transformer = TfidfTransformer(use_idf=tfidf_config.get('use_idf', True),
                                             smooth_idf=tfidf_config.get('smooth_idf', True),
                                             sublinear_tf=tfidf_config.get('sublinear_tf', False),
                                             norm=tfidf_config.get('norm', 'l2'))
        tfidf_transformer.fit(train_counts)
        trained_assets['count_vectorizer'] = count_vectorizer
        trained_assets['tfidf_transformer'] = tfidf_transformer
    return trained_assets

In [7]:
def compute_features(data_pdf, trained_assets, pipeline_config):
    """Build the feature matrix for ``data_pdf``.

    Each entry of ``pipeline_config['feature_computation']['custom_features']``
    is written into ``data_pdf`` as one or more ``feat_*`` columns (the frame is
    modified in place). When a count vectorizer is configured, its TF-IDF
    transformed output comes first and the custom features are appended as the
    last columns.

    Only the ``feat_*`` columns produced by THIS call enter the matrix, so
    columns left over from an earlier call with a different configuration cannot
    leak in. Min-max scaled values are clipped to the fitted range on both
    sides, so they are never negative.

    Args:
        data_pdf: Frame with ``processed_review`` and, for lexicon features,
            ``unprocessed_review``.
        trained_assets: Output of ``fit_features``.
        pipeline_config: Pipeline configuration dict.

    Returns:
        ``(X, feat_id_to_name)``: a CSR sparse matrix and a dict mapping each
        column index to a feature name.

    Raises:
        ValueError: If a custom feature name is unknown.
    """
    feature_config = pipeline_config['feature_computation']
    additional_features = []  # columns produced by this call, in order

    def _scaled(value, value_range):
        # clip to the range seen during fitting, then scale into [0, 1)
        min_val, max_val = value_range
        value = max(min(value, max_val), min_val)
        return (value - min_val) / (max_val - min_val + 1e-6)

    def _store(column, values):
        data_pdf[column] = values
        if column not in additional_features:
            additional_features.append(column)

    for feature in feature_config.get('custom_features', []):
        if feature == 'word_count':
            _store('feat_word_count',
                   data_pdf['processed_review'].apply(lambda x: len(x.split())))
        elif feature == 'word_count_minmax_scaled':
            value_range = trained_assets['word_count_range']
            _store('feat_word_count_minmax_scaled',
                   data_pdf['processed_review']
                   .apply(lambda x: _scaled(len(x.split()), value_range)))
        elif feature == 'char_count':
            _store('feat_char_count', data_pdf['processed_review'].apply(len))
        elif feature == 'char_count_minmax_scaled':
            value_range = trained_assets['char_count_range']
            _store('feat_char_count_minmax_scaled',
                   data_pdf['processed_review']
                   .apply(lambda x: _scaled(len(x), value_range)))
        elif feature == 'lexicon_features':
            for label in label_to_words.keys():
                _store(f'feat_{label}',
                       data_pdf['unprocessed_review']
                       .apply(lambda x: compute_lexicon_feature(x, label, normalize=False)))
        elif feature == 'lexicon_features_minmax_scaled':
            for label in label_to_words.keys():
                value_range = trained_assets[label+'_range']
                _store(f'feat_{label}_minmax_scaled',
                       data_pdf['unprocessed_review']
                       .apply(lambda x: _scaled(
                           compute_lexicon_feature(x, label, normalize=False), value_range)))
        elif feature == 'lexicon_features_normalized':
            for label in label_to_words.keys():
                _store(f'feat_{label}_normalized',
                       data_pdf['unprocessed_review']
                       .apply(lambda x: compute_lexicon_feature(x, label, normalize=True)))
        else:
            raise ValueError(f"Unknown feature: {feature}")

    if 'count_vectorizer' in feature_config:
        count_vectorizer = trained_assets['count_vectorizer']
        tfidf_transformer = trained_assets['tfidf_transformer']
        X = count_vectorizer.transform(data_pdf['processed_review'])
        X = tfidf_transformer.transform(X)
        if len(additional_features) > 0:
            X_additional_features = data_pdf[additional_features].values
            X = sparse.hstack([X, X_additional_features], format='csr')
        feat_id_to_name = {i: "CountFeat_"+name
                           for i, name in enumerate(count_vectorizer.get_feature_names_out())}
        # custom features occupy the columns after the vectorizer's vocabulary
        feat_id_to_name.update({i+len(feat_id_to_name): name
                                for i, name in enumerate(additional_features)})
        return X, feat_id_to_name
    else:
        return sparse.csr_matrix(data_pdf[additional_features].values), \
            {i: name for i, name in enumerate(additional_features)}
    

In [8]:
# Configuration of the whole pipeline: which text-preprocessing steps run and
# how the features are computed.
pipeline_config = dict(
    text_preprocessing=dict(
        extract_ratings=True,
        pos_tagging=True,
        negation_handling=True,
        remove_puncs_n_sp_ch=True,
        remove_stopwords=True,
        stemming=True,
        lowercasing=True,
    ),
    feature_computation=dict(
        tokenizer='split',
        custom_features=[
            'word_count',
            'char_count',
            'word_count_minmax_scaled',
            'char_count_minmax_scaled',
            'lexicon_features',
            'lexicon_features_minmax_scaled',
            'lexicon_features_normalized',
        ],
        count_vectorizer=dict(
            ngram_range=(1, 3),
            binary=False,
            min_df=2,
            max_df=1.0,
            max_features=None,
            tfidf_transformer=dict(
                use_idf=True,
                smooth_idf=True,
                sublinear_tf=False,
                norm='l2',
            ),
        ),
    ),
)

In [9]:
# Keep only text and label; the raw text column is called 'unprocessed_review' from here on.
data = (
    train_pdf[['review', 'sentiment']]
    .copy()
    .rename(columns={'review': 'unprocessed_review'})
)

# 80/20 train/dev split of the training data, each part re-indexed from 0.
train_rows = data.sample(frac=0.8, random_state=0)
train_data = train_rows.reset_index(drop=True)
dev_data = data.drop(train_rows.index).reset_index(drop=True)

In [10]:
# (row index, raw review text) pairs for every training review
reviews_to_process = list(train_data['unprocessed_review'].items())

In [12]:
from concurrent.futures import ProcessPoolExecutor

def process_review(review_tuple):
    """Preprocess one ``(index, review_text)`` pair with the global ``pipeline_config``.

    Returns ``(index, processed_text)`` so a result can be matched back to its row.
    """
    index, review_text = review_tuple
    processed_text = preprocess_text(review_text, pipeline_config)
    return (index, processed_text)

# Collect (row index, raw review text) pairs from the training split
reviews_to_process = []
for index, row in train_data.iterrows():
    reviews_to_process.append((index, row['unprocessed_review']))

# Processing in parallel
# NOTE(review): only the first 1000 reviews are submitted here.
# NOTE(review): worker processes must be able to import/pickle process_review;
# with functions defined inside a notebook this may fail on some platforms — verify.
with ProcessPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(process_review, reviews_to_process[:1000]))

# 'results' is a list of (index, processed_text) tuples for the submitted reviews

In [None]:

# Preprocess every training review with the configured text pipeline.
# The raw text lives in 'unprocessed_review': the 'review' column was renamed
# when `train_data` was built, so train_data['review'] raises a KeyError.
train_data['processed_review'] = train_data['unprocessed_review']\
    .apply(lambda x: preprocess_text(x, pipeline_config))

In [None]:
# fit the scaling ranges and, if configured, the count vectorizer / tf-idf transformer on the training split
trained_assets = fit_features(train_data, pipeline_config)

In [None]:
# build the training feature matrix and the column-index -> feature-name mapping
X_train, feat_id_to_name = compute_features(train_data, trained_assets, pipeline_config)

In [None]:
# shape of the feature matrix returned by compute_features
print (X_train.shape)
# X_train is a SciPy sparse matrix, which has no .head(); slice the first rows instead
print (X_train[:5])
# show only a small sample of the id -> name mapping; the full dict can be very large
dict(list(feat_id_to_name.items())[:20])

In [None]:
# eyeball the preprocessed training reviews
train_data['processed_review']

In [None]:
# Stand-alone timing experiment: how long does fitting the count vectorizer take?
# cv_config is empty, so every option below falls back to its default here.
cv_config = {}
vectorizer_kwargs = {
    'tokenizer': lambda x: x.split(),
    'lowercase': False,
    'ngram_range': cv_config.get('ngram_range', (1, 3)),
    'binary': cv_config.get('binary', False),
    'max_df': cv_config.get('max_df', 1.0),
    'min_df': cv_config.get('min_df', 1),
    'max_features': cv_config.get('max_features', None),
}
cv = CountVectorizer(**vectorizer_kwargs)

start_time = time.time()
cv.fit(train_data['processed_review'])
elapsed_seconds = time.time() - start_time
print (f"Time taken: {elapsed_seconds}")

In [None]:
# number of training reviews
len(train_data['processed_review'])

In [None]:
# number of features (n-grams) in the fitted vectorizer
len(cv.get_feature_names_out())

In [None]:
# Stand-alone TF-IDF transformer for the experiment; missing options use the defaults below.
tfidf_config = cv_config.get('tfidf_transformer', {})
tfidf_kwargs = {
    'use_idf': tfidf_config.get('use_idf', True),
    'smooth_idf': tfidf_config.get('smooth_idf', True),
    'sublinear_tf': tfidf_config.get('sublinear_tf', False),
    'norm': tfidf_config.get('norm', 'l2'),
}
tfidf_transformer = TfidfTransformer(**tfidf_kwargs)

In [None]:
# n-gram count matrix of the training reviews
t = cv.transform(train_data['processed_review'])

In [None]:
# fit the transformer on the count matrix
tfidf_transformer.fit(t)

In [None]:
# NOTE: overwrites `t` (counts) with its transformed version, so re-running this cell alone transforms twice
t = tfidf_transformer.transform(t)

In [None]:
# non-zero entries of the first three rows
print (t[:3])

In [None]:
# Dry-run the pipeline on sample data to check that it works as expected

In [None]:
# Build a sample input: one training review with a website prepended and a
# rating appended, so those preprocessing steps have something to act on.
text = "www.youtube.com " + train_pdf['review'].iloc[45] + " RATING: 10 of 10"

print (text, '\n\n')

# run the full text-preprocessing pipeline on the sample and show the result
preprocessed_text = preprocess_text(text, pipeline_config)
print (preprocessed_text)

In [None]:
# Build a tiny toy dataset by cutting the preprocessed sample into random-length
# chunks and giving every chunk a random label.
toy_dataset = []
tokens = preprocessed_text.split()
random.seed(0)  # fixed seed: chunk sizes and labels are reproducible
chunk_idx = 0
while chunk_idx < len(tokens):
    chunk_size = random.randint(3, 10)
    chunk = ' '.join(tokens[chunk_idx:chunk_idx+chunk_size])
    toy_dataset.append({
        # NOTE(review): the raw text is sliced with token offsets of the PROCESSED text,
        # so the two columns are not guaranteed to cover the same words
        'unprocessed_review': ' '.join(text.split()[chunk_idx:chunk_idx+chunk_size]),
        'processed_review': chunk,
        'sentiment': 'positive' if random.random() > 0.5 else 'negative'
    })
    chunk_idx += chunk_size
# keep at most the first 20 chunks
pdf = pd.DataFrame(toy_dataset)[:20]

# 80/20 split of the toy data, each part re-indexed from 0
train = pdf.sample(frac=0.8, random_state=0)
test = pdf.drop(train.index)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

train

In [None]:
# Fit and compute features on the toy training split.
# NOTE: this overwrites the `trained_assets` / `feat_id_to_name` computed for the real training data.
trained_assets = fit_features(train, pipeline_config)
X, feat_id_to_name = compute_features(train, trained_assets, pipeline_config)
print (X)
print (feat_id_to_name)

In [None]:


# binary target: 1 for 'positive', 0 otherwise
y = (train['sentiment'] == 'positive').astype(int)

# show the toy rows without truncating the text columns
with pd.option_context('display.max_colwidth', None):
    display(train.reset_index(drop=True))

print (X.shape, y.shape)
# print the first six feature rows to compare them by eye with the rows shown above
for i in range(6):
    print (i)
    print (X[i])

In [None]:
# unigram count vectorizer that tokenizes with NLTK's word tokenizer (rebinds `cv`)
cv = CountVectorizer(tokenizer=nltk_tokenizer, ngram_range=(1, 1), binary=False, min_df=1, max_features=None)

In [None]:
# quick check of the WordNet lemmatizer (`nltk_wordnet_lemmatizer` was never defined)
WordNetLemmatizer().lemmatize('running')

In [None]:
# fetch the 'averaged_perceptron_tagger' resource through NLTK's downloader
# (presumably the model behind the pos_tag calls in this notebook — verify)
nltk.download('averaged_perceptron_tagger')

In [None]:
# Compare POS tagging on two tokenizations of the same 200-character snippet.
text = train_pdf['review'].iloc[0][:200]
print (text, '\n')
# 1) NLTK's word tokenizer
tokens = nltk.word_tokenize(text)
print (tokens, '\n')
tags = nltk.pos_tag(tokens)
print(tags, '\n\n')

# 2) regex tokens of two or more word characters (the same pattern the notes call the default tokenizer)
pattern = r"(?u)\b\w\w+\b"
tokens = re.findall(pattern, text)
print (tokens, '\n')
tags = nltk.pos_tag(tokens)
print(tags)

In [None]:
def clean_text(text):
    """Normalize a review into lower-case words separated by single spaces.

    Steps, in order: lower-case; drop ``[...]`` spans; drop website addresses;
    replace HTML tags with a space; drop punctuation; replace newlines with a
    space; drop every word that contains a digit; collapse runs of whitespace.

    Tags and newlines become a space rather than nothing, so the words on both
    sides of e.g. ``<br /><br />`` are not fused into one token.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # the removals above can leave runs of spaces behind
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# English stop words and a WordNet lemmatizer used by preprocessdata
sws=stopwords.words('english')
lemma=WordNetLemmatizer()

def preprocessdata(text):
    """Drop English stop words from ``text``, then lemmatize the remaining words.

    Words are delimited by single spaces; the result is joined with single spaces.
    """
    without_stopwords = ' '.join(filter(lambda word: word not in sws, text.split(' ')))
    lemmatized_words = map(lemma.lemmatize, without_stopwords.split(' '))
    return ' '.join(lemmatized_words)

In [None]:
# Ten training reviews for a side-by-side look at the cleaning functions.
# random_state makes the sample identical on every run.
pdf = train_pdf.sample(10, random_state=0).copy()
pdf

In [None]:
# print each sampled review next to its cleaned version
for index, row in pdf.iterrows():
    print (index, end="\n")
    print (row['review'])  # raw text
    print ("\n\n--------------\n\n")
    print (preprocessdata(clean_text(row['review'])))  # cleaned, stop words removed, lemmatized
    print ("\n\n===================\n\n")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Toy corpus to see what CountVectorizer and TfidfTransformer produce.
s = ['aa aa bb bb Cc cc cc dd', 'cc cc dd dd dd dd ee']
# case-sensitive, 1- to 3-grams, vocabulary capped at 6 features
cv = CountVectorizer(lowercase=False, ngram_range=(1, 3), max_features=6)
X = cv.fit_transform(s)
print(cv.get_feature_names_out())
print (X)
print(X.toarray())

# idf weighting and normalization both switched off — presumably Y then equals the raw counts; compare the printouts
tfidf = TfidfTransformer(use_idf=False, norm=None)
Y = tfidf.fit_transform(X)
# print (tfidf.idf_)
print (Y)
print (Y.toarray())

In [None]:
# Baseline: bag-of-words -> TF-IDF -> logistic regression on the raw review text,
# fitted on the training split and evaluated on the dev split (the test set stays untouched).
model = LogisticRegression(max_iter=1000)
pipe = Pipeline([('bow', CountVectorizer()), ('tfidf', TfidfTransformer()), ('model', model)])
pipe.fit(train_data['unprocessed_review'], train_data['sentiment'])

y_dev = dev_data['sentiment']
y_pred = pipe.predict(dev_data['unprocessed_review'])
print('Accuracy Score: ', accuracy_score(y_dev, y_pred))
print(classification_report(y_dev, y_pred))
# plot_confusion_matrix was removed from scikit-learn; ConfusionMatrixDisplay replaces it
ConfusionMatrixDisplay.from_estimator(pipe, dev_data['unprocessed_review'], y_dev)
plt.show()