In [None]:
import pandas as pd
import nltk
nltk.download('stopwords')
import numpy as np
import string
import re

## ***Preprocessing***



In [None]:
# Load the raw training split from the Kaggle input directory.
# NOTE(review): the file appears to have no header row (the following cells move
# the inferred column names back into the data as a record). Reading with
# header=None and explicit names would avoid that round-trip, but the later
# cells would then need to change too — confirm against the CSV first.
data = pd.read_csv('/kaggle/input/amazon-reviews/train.csv')

# First Phase : EDA

In [None]:
data.info()

In [None]:
data.shape

In [None]:
data.head()

## As we can see, the first data row was read as the column names, so we move it back into the data as a row and then assign proper column names

In [None]:
# The header inferred by read_csv is really the first record: put it back on
# top of the remaining rows and label the three columns.
first_record = np.asarray(data.columns)
remaining_records = np.asarray(data)
data = pd.DataFrame(
    np.vstack([first_record, remaining_records]),
    columns=['Polarity', 'Title', 'Review'],
)
print(data.shape)
data.head()

## After the previous step the 'Polarity' column has dtype object (the value at index 0 is a string), so we cast it back to int

In [None]:
# Cast the labels back to integers: stacking the header row on top of the data
# left the whole column as generic objects.
data['Polarity'] = data['Polarity'].astype(int)

In [None]:
data.info()

In [None]:
# Keep only the label and the review text (the title column is dropped here)
# and renumber the rows from zero.
data = data.loc[:, ['Polarity', 'Review']].reset_index(drop=True)
data.head(15)

In [None]:
# in this case the same review with same class is repeated
data.duplicated().sum()

# we will drop all duplicates

In [None]:
# Remove rows that repeat an earlier (Polarity, Review) pair, then confirm that
# no exact duplicates are left.
data = data.drop_duplicates()
data.duplicated().sum()

In [None]:
data.shape

## In the case below, the same review can appear with a <span style="background:yellow" >different class</span>, which would confuse the model, *so we remove every copy of such reviews from the data, annotate them by hand,* and then merge the annotated rows back into the dataset

In [None]:
data["Review"].duplicated().sum()

In [None]:
# Exact (Polarity, Review) duplicates were dropped above, so any review text
# that still repeats carries conflicting labels. Keep those rows aside for
# manual annotation before removing them.
duplicated_data = data[data["Review"].duplicated(keep=False)]

# Calling drop_duplicates(inplace=True) on data["Review"] only affects the
# extracted column and removes no rows from `data`; drop at DataFrame level so
# every copy of a conflicting review really leaves the dataset.
data = data.drop_duplicates(subset="Review", keep=False)
print(data.shape)
duplicated_data

# the duplicated data with human annotation

In [None]:
# Load the hand-annotated version of the conflicting reviews and show its size
unique_data=pd.read_csv('/kaggle/input/unique/data.csv')
unique_data.shape

In [None]:
unique_data.head()

In [None]:
# Put the hand-annotated rows on top of the remaining reviews. The stacking is
# positional, so the column labels are assigned again afterwards.
# NOTE(review): assumes unique_data has exactly two columns ordered
# (Polarity, Review) — confirm against the annotated CSV.
merged_rows = np.vstack([unique_data, data])
data = pd.DataFrame(merged_rows, columns=['Polarity', 'Review'])
data.head()

# After stacking, the 'Polarity' column has dtype object again

In [None]:
data.info()

In [None]:
# Cast the labels back to integers after the merge turned the column into objects
data['Polarity'] = data['Polarity'].astype(int)

## check the target class whether <span style="background: yellow">Balanced or Not</span>

In [None]:
# Number of reviews per class label
value_counts = data['Polarity'].value_counts()
value_counts

In [None]:
# Bar chart of the class counts. A Series is plotted against its own index, so
# no `x` argument is needed (passing the uncalled `value_counts.keys` method
# had no meaning). Title and axis labels let the figure stand on its own.
ax = value_counts.plot(kind="bar", colormap='viridis')
ax.set(title="Class distribution", xlabel="Polarity", ylabel="Number of reviews");

## check null values

In [None]:
# Count the null values
data.isnull().sum()

No null values

In [None]:
data.info()

# Second Phase: Cleaning

In [None]:
## lower-case every word of a text
def lower(text):
    """Return ``text`` with each whitespace-separated word lower-cased.

    The words are re-joined with single spaces, so runs of whitespace collapse
    and leading/trailing whitespace disappears as a result of the split.
    """
    return ' '.join(map(str.lower, text.split()))
## lower-case the Review column and preview the result
data['Review'] = data['Review'].apply(lower)
data.head()

In [None]:
## this function is for removing hyperlinks
def hyperlinks(text):
    """Strip URLs from ``text``.

    A URL is a token that starts, at a word boundary, with ``http://``,
    ``https://`` or ``www.`` and runs up to the next whitespace character.
    Each match is deleted (replaced by the empty string, not by a space).

    The word boundary and the explicit ``://`` / ``.`` keep ordinary words that
    merely contain those letters intact: ``"awwww"`` and ``"httpd"`` are left
    alone instead of being partly or wholly removed.
    """
    pattern = r'\b(?:https?://|www\.)\S+'
    return re.sub(pattern, '', text, flags=re.IGNORECASE)
## strip hyperlinks from the Review column
data['Review'] = data['Review'].apply(hyperlinks)

In [None]:
## squeeze runs of whitespace (spaces, tabs, newlines) between words
def remove_large_spaces(text):
    """Replace every run of whitespace in ``text`` with a single space and
    trim whitespace from both ends of the result."""
    collapsed = re.compile(r'\s+').sub(' ', text)
    return collapsed.strip()
## normalise whitespace in the Review column
data['Review'] = data['Review'].apply(remove_large_spaces)

In [None]:
# Load NLTK's English stop-word list and print it for inspection
stopword = nltk.corpus.stopwords.words('english')
print(stopword)

In [None]:
# `stopword` is a list, so testing membership against it scans the list for
# every word of every review. A frozenset built once gives constant-time
# lookups with identical results.
_STOPWORD_SET = frozenset(stopword)


def remove_stopwords(text):
    """Return ``text`` without the words listed in ``stopword``.

    The text is split on whitespace and the kept words are re-joined with
    single spaces. Matching is exact: a word with different casing or with
    punctuation attached (e.g. ``"the,"``) is not treated as a stop word.
    """
    return ' '.join(word for word in text.split() if word not in _STOPWORD_SET)


data['Review'] = data['Review'].apply(remove_stopwords)

In [None]:
#we'll import string library as it already contains pre-defined punctuations
import string
string.punctuation
#strip punctuation characters from a text
def remove_punctuation(text):
    """Delete every character of ``string.punctuation`` from ``text``.

    Characters are removed, not replaced, so words separated only by
    punctuation end up joined together.
    """
    return text.translate(str.maketrans('', '', string.punctuation))
#replace each review with its punctuation-free version
data['Review'] = data['Review'].apply(remove_punctuation)

### Note: we need to convert the data below to lower case

In [None]:
## replace non-word characters with spaces
def remove_non_word_characters(sentence):
    """Replace each run of non-word characters (anything ``\\W`` matches) in
    ``sentence`` with one space. Leading and trailing spaces are not trimmed."""
    non_word_run = re.compile(r'\W+')
    return non_word_run.sub(' ', sentence)
## replace the remaining non-word characters in the Review column
data['Review'] = data['Review'].apply(remove_non_word_characters)

In [None]:
## drop digits from text data
def remove_numbers(text):
    """Delete every run of digits from ``text`` (replaced by the empty string,
    so surrounding characters are joined)."""
    return re.sub(r'\d+', '', text)
## strip digits from the Review column
data['Review'] = data['Review'].apply(remove_numbers)

In [None]:
## strip anything that looks like an html tag
def remove_html(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy, so each tag is removed separately and the text
    between two tags is kept.
    """
    return re.sub(r'<.*?>', '', text)
## strip html tags from the Review column
## NOTE(review): remove_punctuation has already run above and string.punctuation
## contains '<' and '>', so no tag can still be present at this point; this step
## probably belongs before punctuation removal.
data['Review'] = data['Review'].apply(remove_html)

In [None]:
## this function is for removing date and time from the texts
def remove_date_time(text):
    """Delete date and time expressions from ``text``.

    Dates are matched as ``M/D/YY`` through ``MM/DD/YYYY`` (digits separated by
    slashes); times as ``H:MM`` or ``HH:MM`` with an optional AM/PM suffix
    directly attached. Matches are replaced by the empty string.

    The AM/PM suffix is matched case-insensitively: the reviews are lower-cased
    earlier in the notebook, and an upper-case-only pattern would strip
    ``10:30`` from ``10:30pm`` but leave a stray ``pm`` behind.
    """
    # Matches MM/DD/YYYY or MM/DD/YY
    date_pattern = r"\d{1,2}/\d{1,2}/\d{2,4}"
    # Matches HH:MM optionally followed by AM/PM in any letter case
    time_pattern = r"\d{1,2}:\d{2}(?:[AP]M)?"
    text_without_date = re.sub(date_pattern, "", text)
    return re.sub(time_pattern, "", text_without_date, flags=re.IGNORECASE)
## strip dates and times from the Review column
## NOTE(review): digits were already deleted by remove_numbers and '/' and ':'
## by remove_punctuation above, so neither pattern can match at this point;
## this step probably belongs before those two.
data['Review'] = data['Review'].apply(remove_date_time)

In [None]:
## drop @mentions and #hashtags from the texts
def remove_mentions_hashtags(text):
    """Delete every ``@word`` and then every ``#word`` token from ``text``.

    Only the marker and the word characters that directly follow it are
    removed; a lone ``@`` or ``#`` is left in place.
    """
    # mentions first, hashtags second
    for marker_pattern in (r"@\w+", r"#\w+"):
        text = re.sub(marker_pattern, "", text)
    return text
## strip mentions and hashtags from the Review column
## NOTE(review): '@' and '#' are part of string.punctuation and were already
## deleted by remove_punctuation above, so nothing can match at this point;
## this step probably belongs before punctuation removal.
data['Review'] = data['Review'].apply(remove_mentions_hashtags)

In [None]:
# word_tokenize relies on the Punkt tokenizer models, which are separate from
# the 'stopwords' corpus downloaded at the top of the notebook. Fetch them here
# so the cell also works on a fresh environment ('punkt_tab' is the resource
# name used by newer NLTK releases, 'punkt' by older ones).
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Turn each cleaned review into a list of tokens
data['Review'] = data['Review'].apply(nltk.word_tokenize)

data.head()

In [None]:
data.head()

## Save the cleaned data

In [None]:
# Write the cleaned frame (without the index) as CleanedTrain.csv inside the
# zip archive CleanedTrain.zip.
# NOTE(review): 'Review' holds token lists at this point, so each cell is
# written as the list's string form and has to be parsed back when the file is
# read — confirm that this is the intended format.
data.to_csv('CleanedTrain.zip', index=False, compression=dict(method='zip', archive_name='CleanedTrain.csv'))

## End of final cleaning