In [27]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [28]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [29]:
v = CountVectorizer(ngram_range=(1,3))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [30]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [31]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [32]:
preprocess("Thor ate pizza")

'thor eat pizza'

In [33]:
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [34]:
corpus_processed = [
    preprocess(text) for text in corpus
]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [35]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [36]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [37]:
import pandas as pd

df = pd.read_json(r"C:\code\nlp-tutorials-main\11_bag_of_n_grams\news_dataset.json", lines=True)
print(df.shape)

df.head(5)

(1, 2)


Unnamed: 0,text,category
0,{'0': 'Watching Schrödinger's Cat Die Universi...,"{'0': 'SCIENCE', '1': 'SCIENCE', '2': 'BUSINES..."


In [38]:
df.category.value_counts()

{'0': 'SCIENCE', '1': 'SCIENCE', '2': 'BUSINESS', '3': 'BUSINESS', '4': 'CRIME', '5': 'BUSINESS', '6': 'SPORTS', '7': 'BUSINESS', '8': 'CRIME', '9': 'SCIENCE', '10': 'BUSINESS', '11': 'BUSINESS', '12': 'CRIME', '13': 'SPORTS', '14': 'SPORTS', '15': 'CRIME', '16': 'SPORTS', '17': 'SPORTS', '18': 'CRIME', '19': 'SPORTS', '20': 'CRIME', '21': 'SPORTS', '22': 'SCIENCE', '23': 'CRIME', '24': 'SPORTS', '25': 'CRIME', '26': 'BUSINESS', '27': 'CRIME', '28': 'BUSINESS', '29': 'BUSINESS', '30': 'SCIENCE', '31': 'CRIME', '32': 'BUSINESS', '33': 'BUSINESS', '34': 'SPORTS', '35': 'SPORTS', '36': 'CRIME', '37': 'BUSINESS', '38': 'CRIME', '39': 'SCIENCE', '40': 'CRIME', '41': 'CRIME', '42': 'CRIME', '43': 'BUSINESS', '44': 'SPORTS', '45': 'CRIME', '46': 'SPORTS', '47': 'CRIME', '48': 'BUSINESS', '49': 'SPORTS', '50': 'BUSINESS', '51': 'BUSINESS', '52': 'SCIENCE', '53': 'BUSINESS', '54': 'BUSINESS', '55': 'BUSINESS', '56': 'SPORTS', '57': 'BUSINESS', '58': 'SPORTS', '59': 'CRIME', '60': 'SPORTS', '61'

In [39]:
min_samples = 1381 # we have these many SCIENCE articles and SCIENCE is our minority class


df_business = df[df.category=="BUSINESS"].sample(min_samples, random_state=2022)
df_sports = df[df.category=="SPORTS"].sample(min_samples, random_state=2022)
df_crime = df[df.category=="CRIME"].sample(min_samples, random_state=2022)
df_science = df[df.category=="SCIENCE"].sample(min_samples, random_state=2022)

ValueError: a must be greater than 0 unless no samples are taken