<a href="https://colab.research.google.com/github/tharun7781/Data-Science-Codes/blob/main/NLP_Basics_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [84]:
import pandas as pd

# Sample reviews containing emoji and punctuation, plus a sentiment label each.
# The emoji are written as \N{...} escapes so the cell stays pure ASCII and
# cannot be corrupted by a wrong file encoding (the previous literals had been
# mangled into Latin-letter mojibake, which emoji.demojize cannot recognise).
data = {
    "review": [
        "Absolutely loved this product! \N{SMILING FACE WITH HEART-SHAPED EYES} Works perfectly & exceeded expectations!",
        "Worst purchase ever... \N{POUTING FACE} Totally disappointed & waste of money!!!",
        "Great value for the price \N{THUMBS UP SIGN} but packaging was a bit damaged \N{CONFUSED FACE}"
    ],
    "sentiment": ["positive", "negative", "positive"]
}

# Create dataframe: one row per review, columns 'review' and 'sentiment'.
df = pd.DataFrame(data)
df

Unnamed: 0,review,sentiment
0,Absolutely loved this product! üòç Works perfect...,positive
1,Worst purchase ever... üò° Totally disappointed ...,negative
2,Great value for the price üëç but packaging was ...,positive


In [85]:
# Normalise case so later steps treat "Loved" and "loved" as the same word.
lowered_reviews = df['review'].str.lower()
df['lowercase'] = lowered_reviews
df.head()

Unnamed: 0,review,sentiment,lowercase
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...


In [86]:
import re
# Strip every character that is neither an ASCII letter nor whitespace
# (punctuation, digits, emoji, ...). Removed characters are not replaced by a
# space, so runs of spaces can remain where a symbol used to be.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
df['clean_specials'] = df['lowercase'].map(lambda text: non_letter_pattern.sub('', text))
df

Unnamed: 0,review,sentiment,lowercase,clean_specials
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...,absolutely loved this product works perfectly...
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...,worst purchase ever totally disappointed was...
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...,great value for the price but packaging was a...


In [87]:
from nltk.tokenize import word_tokenize
import nltk
# Make sure the 'punkt_tab' tokenizer data is available before tokenizing.
nltk.download('punkt_tab')

# Split each cleaned review into a list of word tokens.
cleaned_reviews = df['clean_specials']
df['tokens'] = cleaned_reviews.map(word_tokenize)
df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,review,sentiment,lowercase,clean_specials,tokens
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...,absolutely loved this product works perfectly...,"[absolutely, loved, this, product, works, perf..."
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...,worst purchase ever totally disappointed was...,"[worst, purchase, ever, totally, disappointed,..."
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...,great value for the price but packaging was a...,"[great, value, for, the, price, but, packaging..."


In [88]:
import nltk
# Fetch NLTK's 'stopwords' package; stopwords.words('english') in a later cell reads it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [89]:
from nltk.corpus import stopwords
# Keep the English stop words in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, preserving their order."""
    return [token for token in tokens if token not in stop_words]


df['no_stopwords'] = df['tokens'].map(_drop_stopwords)
df.head()

Unnamed: 0,review,sentiment,lowercase,clean_specials,tokens,no_stopwords
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...,absolutely loved this product works perfectly...,"[absolutely, loved, this, product, works, perf...","[absolutely, loved, product, works, perfectly,..."
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...,worst purchase ever totally disappointed was...,"[worst, purchase, ever, totally, disappointed,...","[worst, purchase, ever, totally, disappointed,..."
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...,great value for the price but packaging was a...,"[great, value, for, the, price, but, packaging...","[great, value, price, packaging, bit, damaged]"


In [90]:
import nltk
# Fetch NLTK's 'wordnet' package; presumably required by the WordNetLemmatizer
# used in a later cell (confirm if that cell changes).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [91]:
# Inspect which columns df carries after the stop-word removal step.
df.columns

Index(['review', 'sentiment', 'lowercase', 'clean_specials', 'tokens',
       'no_stopwords'],
      dtype='object')

In [92]:
pip install emoji




In [93]:
# emoji
# Turn each emoji in the raw review into its textual name so the information
# is kept as text (the rendered output shows names such as ':thumbs_up:').
import emoji
raw_reviews = df['review']
df['emoji_to_text'] = raw_reviews.map(emoji.demojize)
# emoticons
# Lookup table: ASCII emoticon -> word that replaces it.
emoticon_dict = {
    ":-)": "smile",
    ":-D": "laugh",
    ":-(": "sad",
}


def replace_emoticons(text):
    """Return ``text`` with every emoticon listed in ``emoticon_dict`` replaced by its word.

    Replacements are applied one emoticon at a time, in the dictionary's
    insertion order; text without any known emoticon is returned unchanged.
    """
    result = text
    for emoticon in emoticon_dict:
        result = result.replace(emoticon, emoticon_dict[emoticon])
    return result
# Second pass over the same column: swap ASCII emoticons for their words.
demojized_reviews = df['emoji_to_text']
df['emoji_to_text'] = demojized_reviews.map(replace_emoticons)

# lemma
# No visible cell imports WordNetLemmatizer, so on a fresh kernel this step
# raised NameError; import it here, next to its only use.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument, so each token is
# handled with the lemmatizer's default POS. In the rendered output 'works'
# becomes 'work' while 'loved' is left unchanged.
df['lemmatized'] = df['no_stopwords'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

# stemming
# No visible cell imports PorterStemmer, so on a fresh kernel this step
# raised NameError; import it here, next to its only use.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stmmer = stemmer  # original (misspelled) name kept as an alias for any code still using it
# The stems are computed from the already-lemmatized tokens, not from the raw tokens.
df['stemmed'] = df['lemmatized'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)
df.head()

Unnamed: 0,review,sentiment,lowercase,clean_specials,tokens,no_stopwords,emoji_to_text,lemmatized,stemmed
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...,absolutely loved this product works perfectly...,"[absolutely, loved, this, product, works, perf...","[absolutely, loved, product, works, perfectly,...",Absolutely loved this product! :smiling_face_w...,"[absolutely, loved, product, work, perfectly, ...","[absolut, love, product, work, perfectli, exce..."
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...,worst purchase ever totally disappointed was...,"[worst, purchase, ever, totally, disappointed,...","[worst, purchase, ever, totally, disappointed,...",Worst purchase ever... :enraged_face: Totally ...,"[worst, purchase, ever, totally, disappointed,...","[worst, purchas, ever, total, disappoint, wast..."
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...,great value for the price but packaging was a...,"[great, value, for, the, price, but, packaging...","[great, value, price, packaging, bit, damaged]",Great value for the price :thumbs_up: but pack...,"[great, value, price, packaging, bit, damaged]","[great, valu, price, packag, bit, damag]"


In [94]:
# Inspect the columns again; the cell above added 'emoji_to_text', 'lemmatized' and 'stemmed'.
df.columns

Index(['review', 'sentiment', 'lowercase', 'clean_specials', 'tokens',
       'no_stopwords', 'emoji_to_text', 'lemmatized', 'stemmed'],
      dtype='object')

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights and build the TF-IDF matrix in one pass.
# fit_transform gives the same result as fit() followed by transform() on the
# same data, without tokenizing the corpus twice; `tf` is left fitted.
# NOTE(review): this vectorizes 'clean_specials', not the stop-word-filtered or
# stemmed tokens built in the cells above. Confirm that is intended.
tf = TfidfVectorizer()
tf.fit_transform(df['clean_specials']).toarray()

array([[0.35355339, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.35355339, 0.35355339, 0.        , 0.        ,
        0.35355339, 0.        , 0.        , 0.        , 0.35355339,
        0.        , 0.35355339, 0.        , 0.        , 0.35355339,
        0.        , 0.        , 0.        , 0.        , 0.35355339,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.35355339,
        0.35355339, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.35355339, 0.35355339, 0.        , 0.        ,
        0.        , 0.        , 0.35355339, 0.        , 0.        ,
        0.35355339, 0.        , 0.        , 0.35355339, 0.        ,
        0.35355339],
       [0.        , 0.31622777, 0.31622777, 0.31622777, 0.        ,
        0.        , 0.        , 0.        , 0.31622777, 0.31622777,
        0.        , 0.        , 0.        , 0.31622777, 0.        ,
        0.31622777, 0.        , 0.        , 0.31622777, 0.        ,
      

In [102]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary and build the raw term-count matrix in one pass.
# fit_transform gives the same result as fit() followed by transform() on the
# same data, without tokenizing the corpus twice; `cv` is left fitted.
# NOTE(review): this vectorizes 'clean_specials', not the stop-word-filtered or
# stemmed tokens built in the cells above. Confirm that is intended.
cv = CountVectorizer()
cv.fit_transform(df['clean_specials']).toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 1, 0, 1],
       [0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0]])

In [97]:
# Final look at the column labels on df.
df.columns

Index(['review', 'sentiment', 'lowercase', 'clean_specials', 'tokens',
       'no_stopwords', 'emoji_to_text', 'lemmatized', 'stemmed'],
      dtype='object')