In [2]:
import pandas as pd
import re
import nltk
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the NLTK resources used further down: the stop-word lists, the
# tokenizer models behind nltk.word_tokenize, and the WordNet data used by
# WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) load the tokenizer models from the
# 'punkt_tab' package instead of 'punkt'; without it nltk.word_tokenize raises
# LookupError on a fresh environment. Fetching both keeps older and newer
# NLTK versions working.
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the 'amazon_polarity' dataset; the next cell reads its 'train' split.
dataset = load_dataset('amazon_polarity')

In [5]:
# Draw a reproducible 20,000-row sample of the training split.
# Dataset.to_pandas() converts the split to a DataFrame directly, whereas
# pd.DataFrame(dataset['train']) has to iterate every row as a Python dict
# first, which is far slower and more memory-hungry on a split with millions
# of rows. Columns, index and the sampled rows are the same either way.
df = dataset['train'].to_pandas().sample(20000, random_state=42)

In [6]:
# Display the sampled frame (label, title and content columns).
df

Unnamed: 0,label,title,content
2079998,0,Expensive Junk,This product consists of a piece of thin flexi...
1443106,0,Toast too dark,"Even on the lowest setting, the toast is too d..."
3463669,1,Excellent imagery...dumbed down story,I enjoyed this disc. The video is stunning. I ...
2914699,0,Are we pretending everyone is married?,The authors pretend that parents neither die n...
1603231,0,Not worth your time,"Might as well just use a knife, this product h..."
...,...,...,...
2791818,0,"Great sound, poor construction","I used these headphones at a desk job, listeni..."
1870098,0,"""Stingy and egotistical gambler---not a role m...",Egotism is partly enthusiasm -- but mostly ign...
1766999,1,RED!,I do love the color. Much more vibrant than in...
301354,1,A great silent.,Francesca Bertini was the first diva of Italia...


In [7]:
def clean_text(sentence):
    """Normalise raw text to lowercase a-z words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. remove every run of non-whitespace characters that begins with
         ``http`` or ``www`` (URLs);
      3. delete apostrophes, so a contraction stays one word
         (``don't`` -> ``dont``);
      4. replace every remaining character that is not a letter or whitespace
         with a space;
      5. collapse whitespace runs to one space and trim both ends.

    Args:
        sentence: The text to clean (a ``str``).

    Returns:
        The cleaned string. It is empty when the input holds no letters.
    """
    sentence = sentence.lower()
    sentence = re.sub(r"http\S+|www\S+", '', sentence)
    # Apostrophes are removed outright rather than spaced out, so contractions
    # and possessives are not split into stray one-letter fragments.
    sentence = re.sub(r"['’]", '', sentence)
    # Substitute a space (not an empty string) for the other non-letters:
    # deleting them glued together words separated only by punctuation,
    # e.g. "imagery...dumbed" became "imagerydumbed". The text is already
    # lowercase here, so a-z covers every letter that is kept.
    sentence = re.sub(r'[^a-z\s]', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    return sentence

In [8]:
# Lemmatizer instance shared by the preprocessing function below.
lem = WordNetLemmatizer()

# English stop words, held in a set for membership checks.
stop_words = set(stopwords.words('english'))

In [9]:
def pre_process(sentence):
    """Clean, tokenize, filter and lemmatize one piece of text.

    The text is passed through ``clean_text`` and split with
    ``nltk.word_tokenize``. Tokens found in ``stop_words`` are dropped and the
    rest are lemmatized with ``lem``.

    Args:
        sentence: The raw text to process.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept = []
    for word in nltk.word_tokenize(clean_text(sentence)):
        if word in stop_words:
            continue
        kept.append(lem.lemmatize(word))
    return " ".join(kept)
    

In [10]:
# Build the cleaned text column from the raw review body.
df['clean_sentence'] = df['content'].apply(pre_process)

# Turn the numeric polarity into a readable label. Values other than 0/1 are
# passed through unchanged, so re-running this cell keeps the already-mapped
# strings. A plain .map(dict) yields NaN for any value missing from the dict,
# which wiped the whole column on a second run.
label_names = {0: 'Negative', 1: 'Positive'}
df['label'] = df['label'].map(lambda value: label_names.get(value, value))

In [16]:
# Display the frame again, now with text labels and the clean_sentence column.
df

Unnamed: 0,label,title,content,clean_sentence
2079998,Negative,Expensive Junk,This product consists of a piece of thin flexi...,product consists piece thin flexible insulatin...
1443106,Negative,Toast too dark,"Even on the lowest setting, the toast is too d...",even lowest setting toast dark liking also lig...
3463669,Positive,Excellent imagery...dumbed down story,I enjoyed this disc. The video is stunning. I ...,enjoyed disc video stunning agree others story...
2914699,Negative,Are we pretending everyone is married?,The authors pretend that parents neither die n...,author pretend parent neither die divorce insi...
1603231,Negative,Not worth your time,"Might as well just use a knife, this product h...",might well use knife product hold next nothing...
...,...,...,...,...
2791818,Negative,"Great sound, poor construction","I used these headphones at a desk job, listeni...",used headphone desk job listening ipod npr str...
1870098,Negative,"""Stingy and egotistical gambler---not a role m...",Egotism is partly enthusiasm -- but mostly ign...,egotism partly enthusiasm mostly ignorance jor...
1766999,Positive,RED!,I do love the color. Much more vibrant than in...,love color much vibrant picture lot room like ...
301354,Positive,A great silent.,Francesca Bertini was the first diva of Italia...,francesca bertini first diva italian movie por...


In [11]:
# Write the processed frame to CSV without its index, then confirm on stdout.
# NOTE(review): the destination is an absolute, machine-specific path.
output_csv = r'F:\project\intelligent customer sentiment analysis\preprocessed_dataset.csv'
df.to_csv(output_csv, index=False)
print("Dataset cleaned")

Dataset cleaned
