In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nlp=spacy.load('en_core_web_sm')

In [3]:
text="This is first sentence.and this is another one. here 3rd one."

In [4]:
doc=nlp(text)

In [5]:
# Register the rule-based sentence splitter ahead of the parser.
# Guarded so the cell can be re-run in the same kernel: spaCy raises if a
# component with the same name is added to the pipeline twice.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer', before='parser')


<spacy.pipeline.sentencizer.Sentencizer at 0x1fcdadfae00>

In [6]:
doc=nlp(text)

In [7]:
for sent in doc.sents:
    print(sent)

This is first sentence.and this is another one.
here 3rd one.


In [8]:
from spacy.lang.en.stop_words import STOP_WORDS

In [9]:
stopwords=list(STOP_WORDS)

In [10]:
print(stopwords)

['anyhow', 'then', 'whereas', 'via', 'over', 'fifteen', 'even', '‘m', 'may', 'so', '’ve', 'almost', 'somewhere', "'s", 'hereby', 'serious', 'always', 'along', 'put', 'regarding', 'sometime', 'at', 'both', 'already', 'have', 'thru', 'and', 'eleven', 'hers', 'doing', 'this', 'move', 'do', 'but', 'used', 'will', 'through', 'an', 'say', 'seem', 'front', 'he', 'please', 'formerly', 'these', "'re", 'other', 'therein', 'before', '‘s', 'all', 're', 'quite', 'few', 'for', 'full', 'ten', 'wherever', 'ourselves', 'without', 'afterwards', 'whence', 'hundred', 'us', 'becoming', 'it', 'into', 'only', 'elsewhere', 'something', 'between', 'very', 'its', 'latter', 'are', "n't", 'together', 'twenty', 'third', 'cannot', 'using', 'eight', 'side', 'neither', 'own', 'within', 'across', 'however', 'such', 'much', 'beyond', 'hence', 'throughout', 'yourselves', 'she', 'everywhere', 'whatever', 'does', 'a', 'go', 'amongst', 'amount', 'is', 'everyone', 'from', 'our', 'down', 'again', 'sixty', 'top', 'nothing', '

In [11]:
len(stopwords)

326

Lemmatization

In [12]:
doc=nlp('run runs running runner')

In [13]:
for lem in doc:
    print(lem.text,lem.lemma_)

run run
runs run
running run
runner runner


Part of Speech

In [14]:
doc=nlp('All is well at your end!')

In [15]:
for token in doc:
    print(token.text,token.pos_)

All PRON
is AUX
well ADV
at ADP
your PRON
end NOUN
! PUNCT


In [16]:
displacy.render(doc,style='dep')

Entity Detection

In [17]:
doc = nlp("WASHINGTON — President Biden and his extended family arrived late Friday for a weeklong Lake Tahoe vacation at the $18 million waterfront mansion of billionaire climate investor Tom Steyer.Biden, 80, and his scandal-plagued son Hunter, 53, showed up separately to the six-bedroom Nevada retreat, which is touted on Redfin as “one of Lake Tahoe’s finest lakefront properties and the pinnacle waterfront estate within the gated Glenbrook community.”The home features a “spectacular lakeside gazebo[that] rests between the park-like grounds and the calming shores of Glenbrook Bay, ” the real estate website says.The president has enjoyed free vacations at the homes of other prominent Democrats, but the White House said that he will pay for the stay at Steyer’s three-acre property")


In [18]:
doc

WASHINGTON — President Biden and his extended family arrived late Friday for a weeklong Lake Tahoe vacation at the $18 million waterfront mansion of billionaire climate investor Tom Steyer.Biden, 80, and his scandal-plagued son Hunter, 53, showed up separately to the six-bedroom Nevada retreat, which is touted on Redfin as “one of Lake Tahoe’s finest lakefront properties and the pinnacle waterfront estate within the gated Glenbrook community.”The home features a “spectacular lakeside gazebo[that] rests between the park-like grounds and the calming shores of Glenbrook Bay, ” the real estate website says.The president has enjoyed free vacations at the homes of other prominent Democrats, but the White House said that he will pay for the stay at Steyer’s three-acre property

In [19]:
displacy.render(doc,style='ent')

TEXT CLASSIFICATION

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [21]:
# Some generated sample data: product reviews, each with a 1-5 rating
data = {
    "Review": [
        "The product was terrible. It broke within a week.",
        "I'm quite disappointed with this purchase. It didn't live up to my expectations.",
        "It's an average product. Not great, but not terrible either.",
        "I'm satisfied with my purchase. The product met my needs.",
        "This is an excellent product! It exceeded my expectations in every way.",
        "I regret buying this. It's not worth the price.",
        "The quality of the product is subpar. I wouldn't recommend it.",
        "It's a decent product for the price. Gets the job done.",
        "I'm pleasantly surprised by how good this product is.",
        "This product is a game-changer! I can't imagine life without it.",
        "The customer service was awful. I had a terrible experience.",
        "They were slow to respond to my inquiries. Not impressed.",
        "The customer service was satisfactory, but nothing exceptional.",
        "I had a positive experience with their customer service. They were helpful.",
        "Their customer service is top-notch! They went above and beyond.",
        "The shipping was a nightmare. My package arrived damaged.",
        "The shipping took longer than expected. It was quite frustrating.",
        "The shipping process was average. No major complaints.",
        "My order arrived on time and in good condition. Happy with the shipping.",
        "Superb shipping! My package was well-packaged and arrived quickly.",
        "The product was a complete waste of money. I'm very disappointed.",
        "I expected more considering the price. This product isn't worth it.",
        "It's an okay product, but not great value for the money.",
        "I feel like I got my money's worth with this purchase.",
        "Best value for money! This product exceeded my expectations."
    ],
    "Rating": [
        1, 2, 3, 4, 5,
        1, 2, 3, 4, 5,
        1, 2, 3, 4, 5,
        1, 2, 3, 4, 5,
        1, 2, 3, 4, 5
    ]
}

df = pd.DataFrame(data)
# More examples of reviews and ratings
more_data = {
    "Review": [
        "I wouldn't recommend this product to anyone. It's a complete letdown.",
        "The quality of this item is impressive. It's worth every penny.",
        "I'm on the fence about this product. It has both pros and cons.",
        "The overall experience was mediocre. Nothing stood out.",
        "I'm in awe of this product's performance. It's a real game-changer.",
        "I encountered numerous issues with this purchase. Very frustrating.",
        "The packaging was terrible. It arrived all smashed up.",
        "Shipping was surprisingly fast, even though I had doubts.",
        "This product exceeded my expectations. I'm pleasantly surprised.",
        "The customer service needs improvement. They were unhelpful.",
        "I'm extremely satisfied with my purchase. It's worth every penny.",
        "The shipping process was a breeze. The package came in perfect condition.",
        "This product is overpriced for what you get. Disappointed.",
        "The company's customer service was a lifesaver. So understanding.",
        "The shipping experience was lackluster. It took ages to arrive.",
        "I can't imagine my life without this product. It's that good!",
        "This purchase was a mistake. I should've chosen a different product.",
        "The shipping speed was alright. Nothing to complain about.",
        "I'm overjoyed with this product. It's exceeded all my expectations.",
        "The customer service was prompt and helpful. Kudos to them.",
        "This is the worst product I've ever bought. Absolute garbage.",
        "For the price, this product is a steal! I'm pleasantly surprised.",
        "The shipping was flawless and quick. I'm really impressed.",
        "I had high hopes, but this product fell short. Quite disappointing.",
        "This purchase was worth every cent. I'm a happy customer."
    ],
    "Rating": [
        1, 5, 3, 2, 5,
        1, 2, 4, 5, 2,
        5, 4, 2, 4, 1,
        4, 2, 3, 5, 4,
        1, 5, 5, 2, 5
    ]
}

df_more = pd.DataFrame(more_data)

# Concatenate the two DataFrames
df_combined = pd.concat([df, df_more], ignore_index=True)

print(df_combined)


                                               Review  Rating
0   The product was terrible. It broke within a week.       1
1   I'm quite disappointed with this purchase. It ...       2
2   It's an average product. Not great, but not te...       3
3   I'm satisfied with my purchase. The product me...       4
4   This is an excellent product! It exceeded my e...       5
5     I regret buying this. It's not worth the price.       1
6   The quality of the product is subpar. I wouldn...       2
7   It's a decent product for the price. Gets the ...       3
8   I'm pleasantly surprised by how good this prod...       4
9   This product is a game-changer! I can't imagin...       5
10  The customer service was awful. I had a terrib...       1
11  They were slow to respond to my inquiries. Not...       2
12  The customer service was satisfactory, but not...       3
13  I had a positive experience with their custome...       4
14  Their customer service is top-notch! They went...       5
15  The 

In [22]:
columns_name = ['Review', 'Sentiment']
df_combined.columns=columns_name

In [23]:
df_combined.head()

Unnamed: 0,Review,Sentiment
0,The product was terrible. It broke within a week.,1
1,I'm quite disappointed with this purchase. It ...,2
2,"It's an average product. Not great, but not te...",3
3,I'm satisfied with my purchase. The product me...,4
4,This is an excellent product! It exceeded my e...,5


Tokenization

In [24]:
import string

In [25]:
punct=string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [26]:
def text_data_cleaning(sentence):
    """Tokenize ``sentence`` with spaCy and return its cleaned tokens.

    Every token is replaced by its lower-cased, stripped lemma. Tokens that
    are empty after stripping, that appear in ``stopwords``, or that consist
    only of characters from ``punct`` are dropped.

    Relies on the notebook-level names ``nlp`` (the loaded spaCy pipeline),
    ``stopwords`` and ``punct``.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    doc = nlp(sentence)

    # Build sets once per call so each membership test is O(1) instead of a
    # scan over the stop-word list / punctuation string for every token.
    stop_set = set(stopwords)
    punct_chars = set(punct)

    cleaned_tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            # "-PRON-" is a placeholder lemma (legacy spaCy behaviour for
            # pronouns); fall back to the lower-cased surface form.
            temp = token.lower_

        # Whitespace-only tokens strip down to '' - drop them explicitly
        # rather than relying on '' being a substring of the punctuation string.
        if not temp:
            continue
        if temp in stop_set:
            continue
        # Character-wise test: a plain `temp in punct` is a substring check and
        # lets repeated punctuation such as "..." or "!!" slip through.
        if all(ch in punct_chars for ch in temp):
            continue
        cleaned_tokens.append(temp)
    return cleaned_tokens

In [27]:
text_data_cleaning( "     Hello how are you.Like this video")

['hello', 'like', 'video']

Vectorization Feature Engineering

In [28]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline


In [29]:
# token_pattern=None: the custom tokenizer replaces the default regex, and
# leaving the default in place makes scikit-learn warn that it is ignored.
tfidf=TfidfVectorizer(tokenizer=text_data_cleaning,token_pattern=None)
# Fixed seed so that re-running the notebook fits the same model.
classifer=LinearSVC(random_state=42)

In [30]:
X=df_combined['Review']
y=df_combined['Sentiment']

In [31]:
clf=Pipeline([('tfidf',tfidf),('clf',classifer)])

In [32]:
clf.fit(X,y)



In [33]:
test_data = {
    "Review": [
        "The product exceeded my expectations. I'm thoroughly impressed.",
        "It's of poor quality.",
        "Decent product for the price. It serves its purpose.",
    ],
    "Rating": [5, 2, 3]
}

df_test = pd.DataFrame(test_data)


In [34]:
X_test=df_test['Review']

In [35]:
y_pred=clf.predict(X_test)

In [36]:
print(classification_report(df_test['Rating'],y_pred))

              precision    recall  f1-score   support

           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [37]:
confusion_matrix(df_test['Rating'],y_pred)

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]], dtype=int64)