# Notebook 1: Cleaning & Saving Clean Text

### Importing Needed Libraries

In [None]:
# For data cleaning
import pandas as pd

# For language detection
from langdetect import detect

### Step 1: Load the Data

In [2]:
df = pd.read_csv(r"C:\book-sentiment-project\data\Book Reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,/// gentle reminder that this is not the time ...,"March 24, 2022"
1,1,To Kill a Mockingbird,\n|\n|6.0 stars. I know I am risking a serious...,"May 24, 2011"
2,2,To Kill a Mockingbird,\n|\n|Looking for a new book but don't want to...,"December 10, 2020"
3,3,To Kill a Mockingbird,"To Kill a Mockingbird, Harper Lee|To Kill a Mo...","July 1, 2022"
4,4,To Kill a Mockingbird,Why is it when I pick up | To Kill A Mockingbi...,"October 25, 2009"


### Step 2: Clean the Data

In [10]:
# Check number of rows and columns
df.shape

(31772, 4)

In [3]:
# Remove unwanted characters
df["Review"] = df["Review"].str.replace("\n", "", regex=True)
df["Review"] = df["Review"].str.replace("[/|]", "", regex=True)
df["Review"] = df["Review"].str.strip()
df.head()

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,gentle reminder that this is not the time to r...,"March 24, 2022"
1,1,To Kill a Mockingbird,6.0 stars. I know I am risking a serious “FILM...,"May 24, 2011"
2,2,To Kill a Mockingbird,Looking for a new book but don't want to commi...,"December 10, 2020"
3,3,To Kill a Mockingbird,"To Kill a Mockingbird, Harper LeeTo Kill a Moc...","July 1, 2022"
4,4,To Kill a Mockingbird,Why is it when I pick up To Kill A Mockingbir...,"October 25, 2009"


In [4]:
# Checking if there are any nulls
df.isnull().sum()

# There are 309 rows missing in the "Review" column

Unnamed: 0       0
Book             0
Review         309
Review Date      0
dtype: int64

In [5]:
# Remove rows where "Review" column is null and update df
df = df.dropna(subset=["Review"])

In [6]:
# Checking if all nulls were dropped
df.isnull().sum()

Unnamed: 0     0
Book           0
Review         0
Review Date    0
dtype: int64

In [7]:
def detect_language(text):
    try:
        # Try to detect language of the input text
        return detect(text)
    except:
        # If an error occurs, return 'xx'
        return 'xx'

# Create a full copy of the original df to safely work with it
df_with_lan = df.copy()

# Apply the detect_language function to each review
# and store the result in a new column called 'language'
df_with_lan['language'] = df_with_lan['Review'].apply(detect_language)

df_with_lan.head()

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date,language
0,0,To Kill a Mockingbird,gentle reminder that this is not the time to r...,"March 24, 2022",en
1,1,To Kill a Mockingbird,6.0 stars. I know I am risking a serious “FILM...,"May 24, 2011",en
2,2,To Kill a Mockingbird,Looking for a new book but don't want to commi...,"December 10, 2020",en
3,3,To Kill a Mockingbird,"To Kill a Mockingbird, Harper LeeTo Kill a Moc...","July 1, 2022",fa
4,4,To Kill a Mockingbird,Why is it when I pick up To Kill A Mockingbir...,"October 25, 2009",en


In [None]:
# pd.set_option('display.max_colwidth', None) 
# df_with_lan.head()

In [8]:
# Keep only the English reviews
df_with_lan = df_with_lan[df_with_lan['language'] == 'en']

# Reset Index
df_with_lan.reset_index(drop=True, inplace=True)

df_with_lan.head()

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date,language
0,0,To Kill a Mockingbird,gentle reminder that this is not the time to r...,"March 24, 2022",en
1,1,To Kill a Mockingbird,6.0 stars. I know I am risking a serious “FILM...,"May 24, 2011",en
2,2,To Kill a Mockingbird,Looking for a new book but don't want to commi...,"December 10, 2020",en
3,4,To Kill a Mockingbird,Why is it when I pick up To Kill A Mockingbir...,"October 25, 2009",en
4,5,To Kill a Mockingbird,I had a much longer review written for this bo...,"December 17, 2020",en


In [None]:
new_df = df_with_lan.drop(columns=['language'])
new_df.head(10)

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,gentle reminder that this is not the time to r...,"March 24, 2022"
1,1,To Kill a Mockingbird,6.0 stars. I know I am risking a serious “FILM...,"May 24, 2011"
2,2,To Kill a Mockingbird,Looking for a new book but don't want to commi...,"December 10, 2020"
3,4,To Kill a Mockingbird,Why is it when I pick up To Kill A Mockingbir...,"October 25, 2009"
4,5,To Kill a Mockingbird,I had a much longer review written for this bo...,"December 17, 2020"
5,7,To Kill a Mockingbird,With endless books and infinitely more to be w...,"March 11, 2019"
6,8,To Kill a Mockingbird,While the plot was very gripping and well-writ...,"April 18, 2012"
7,9,To Kill a Mockingbird,"In the course of 5 years, I’ve read this book ...","May 4, 2015"
8,10,To Kill a Mockingbird,So... I don't really know what to say.I think ...,"November 12, 2015"
9,11,To Kill a Mockingbird,Beautiful book.,"October 20, 2016"


In [None]:
# Check number of rows and columns
new_df.shape

(23056, 4)

### Step 3: Save Cleaned Data to CSV File

In [16]:
new_df.to_csv("data/cleaned_data.csv", index=False)