In [1]:
import pandas as pd
import re
from langdetect import detect

In [2]:
df = pd.read_csv('reviews.csv', sep=';', encoding='utf-8', index_col='Unnamed: 0')
df

Unnamed: 0,review,title,tag,rating
0,"\n\n\n\n\n\n Cringe ""comedy"" in...",Amagi Brilliant Park,Not RecommendedPreliminary\n \n...,2
1,\n\n\n\n\n\n So this is the thi...,Higeki no Genkyou to Naru Saikyou Gedou Last B...,Mixed Feelings,5
2,\n\n\n\n\n\n The show is depres...,Sousou no Frieren,RecommendedPreliminary\n \n ...,9
3,"\n\n\n\n\n\n Ugly CGI, bland vi...",Kimetsu no Yaiba,Not Recommended,3
4,\n\n\n\n\n\n Boring. Cliche. La...,Ousama Ranking,Not Recommended,2
...,...,...,...,...
242138,\n\n\n\n\n\n A comedy/romance b...,Ranma ½,Recommended,7
242139,\n\n\n\n\n\n Saiyuki is one of ...,Gensoumaden Saiyuuki,Recommended,9
242140,"\n\n\n\n\n\n First, let me say ...",Hajime no Ippo,Recommended,10
242141,\n\n\n\n\n\n Utawarerumono mana...,Utawarerumono,Recommended,8


Removing extra data and symbols from the "tag" column.

In [3]:
new_tag_list = []
for tag in list(df.tag):
    if tag.startswith('Recommended'):
        new_tag = 'Recommended'
    elif tag.startswith('Mixed Feelings'):
        new_tag = 'Mixed Feelings'
    elif tag.startswith('Not Recommended'):
        new_tag = 'Not Recommended'
    new_tag_list.append(new_tag)

df.tag = new_tag_list
df

Unnamed: 0,review,title,tag,rating
0,"\n\n\n\n\n\n Cringe ""comedy"" in...",Amagi Brilliant Park,Not Recommended,2
1,\n\n\n\n\n\n So this is the thi...,Higeki no Genkyou to Naru Saikyou Gedou Last B...,Mixed Feelings,5
2,\n\n\n\n\n\n The show is depres...,Sousou no Frieren,Recommended,9
3,"\n\n\n\n\n\n Ugly CGI, bland vi...",Kimetsu no Yaiba,Not Recommended,3
4,\n\n\n\n\n\n Boring. Cliche. La...,Ousama Ranking,Not Recommended,2
...,...,...,...,...
242138,\n\n\n\n\n\n A comedy/romance b...,Ranma ½,Recommended,7
242139,\n\n\n\n\n\n Saiyuki is one of ...,Gensoumaden Saiyuuki,Recommended,9
242140,"\n\n\n\n\n\n First, let me say ...",Hajime no Ippo,Recommended,10
242141,\n\n\n\n\n\n Utawarerumono mana...,Utawarerumono,Recommended,8


Cleaning the dataset from non-English reviews using the langdetect library.

In [11]:
indices_non_en = []

for index, row in df.iterrows():
    try:
        detected_language = detect(row.review)
        if detected_language != 'en':
            indices_non_en.append(index)
    except:
        continue

df.drop(indices_non_en, inplace=True)

Cleaning the reviews from extra symbols and numbers.

In [15]:
def preprocess(text):
    text = re.sub(r'[\n\r\s]+', ' ', text)
    # text = re.sub(r'[\.,\!\?\"\-:;\\\/\=\(\)•★%\*\[\]\$#@…_]+', ' ', text)
    text = re.sub(r'=•★%\*\[\]\$#@…_', ' ', text)
    text = re.sub(r'\’', '\'', text)
    text = re.sub(r'[0-9]+', ' ', text)
    text = re.sub(r'\s\s+', ' ', text)
    return text.strip()

In [18]:
new_review_list = [preprocess(review) for review in list(df.review)]

df.review = new_review_list
df

Unnamed: 0,review,title,tag,rating
0,"Cringe ""comedy"" in a nutshell except there's n...",Amagi Brilliant Park,Not Recommended,2
1,So this is the thing about this anime. **some ...,Higeki no Genkyou to Naru Saikyou Gedou Last B...,Mixed Feelings,5
2,The show is depressing as fuck. But good. As f...,Sousou no Frieren,Recommended,9
3,"Ugly CGI, bland villains, bland one-dimensiona...",Kimetsu no Yaiba,Not Recommended,3
4,Boring. Cliche. Lame. Ousama Ranking is extrem...,Ousama Ranking,Not Recommended,2
...,...,...,...,...
242138,A comedy/romance based on the manga by Rumiko ...,Ranma ½,Recommended,7
242139,Saiyuki is one of those animes that just grabs...,Gensoumaden Saiyuuki,Recommended,9
242140,"First, let me say that I\'m not a fan of boxin...",Hajime no Ippo,Recommended,10
242141,Utawarerumono manages to be one of those harem...,Utawarerumono,Recommended,8


In [19]:
df.to_csv('reviews_clean.csv', sep=';', encoding='utf-8')