In [1]:
import re

import pandas as pd
from langdetect import DetectorFactory, LangDetectException, detect

In [21]:
# Load the raw reviews. The file is ';'-separated and its 'Unnamed: 0' column
# is used as the DataFrame index.
df = pd.read_csv(
    'reviews.csv',
    sep=';',
    encoding='utf-8',
    index_col='Unnamed: 0',
)
df

Unnamed: 0,review,title,tag,rating
0,"\n\n\n\n\n\n Cringe ""comedy"" in...",Amagi Brilliant Park,Not RecommendedPreliminary\n \n...,2
1,\n\n\n\n\n\n So this is the thi...,Higeki no Genkyou to Naru Saikyou Gedou Last B...,Mixed Feelings,5
2,\n\n\n\n\n\n The show is depres...,Sousou no Frieren,RecommendedPreliminary\n \n ...,9
3,"\n\n\n\n\n\n Ugly CGI, bland vi...",Kimetsu no Yaiba,Not Recommended,3
4,\n\n\n\n\n\n Boring. Cliche. La...,Ousama Ranking,Not Recommended,2
...,...,...,...,...
242138,\n\n\n\n\n\n A comedy/romance b...,Ranma ½,Recommended,7
242139,\n\n\n\n\n\n Saiyuki is one of ...,Gensoumaden Saiyuuki,Recommended,9
242140,"\n\n\n\n\n\n First, let me say ...",Hajime no Ippo,Recommended,10
242141,\n\n\n\n\n\n Utawarerumono mana...,Utawarerumono,Recommended,8


Removing extra data and symbols from the "tag" column as some rows have not been parsed correctly.

In [22]:
# Canonical labels. A raw tag may carry extra text after the label
# (e.g. "RecommendedPreliminary\n ..."), so only the prefix is matched.
CANONICAL_TAGS = ('Recommended', 'Mixed Feelings', 'Not Recommended')


def normalize_tag(tag):
    """Return the canonical label that `tag` starts with, or None.

    Parameters
    ----------
    tag : object
        Raw value from the "tag" column.

    Returns
    -------
    str or None
        One of CANONICAL_TAGS when `tag` is a string beginning with that
        label; None when `tag` is not a string (e.g. NaN) or matches none of
        the labels. Returning None keeps an unrecognised row visibly missing
        instead of silently inheriting the label of the previous row.
    """
    if not isinstance(tag, str):
        return None
    for label in CANONICAL_TAGS:
        if tag.startswith(label):
            return label
    return None


new_tag_list = [normalize_tag(tag) for tag in df.tag]

df['tag'] = new_tag_list
df

Unnamed: 0,review,title,tag,rating
0,"\n\n\n\n\n\n Cringe ""comedy"" in...",Amagi Brilliant Park,Not Recommended,2
1,\n\n\n\n\n\n So this is the thi...,Higeki no Genkyou to Naru Saikyou Gedou Last B...,Mixed Feelings,5
2,\n\n\n\n\n\n The show is depres...,Sousou no Frieren,Recommended,9
3,"\n\n\n\n\n\n Ugly CGI, bland vi...",Kimetsu no Yaiba,Not Recommended,3
4,\n\n\n\n\n\n Boring. Cliche. La...,Ousama Ranking,Not Recommended,2
...,...,...,...,...
242138,\n\n\n\n\n\n A comedy/romance b...,Ranma ½,Recommended,7
242139,\n\n\n\n\n\n Saiyuki is one of ...,Gensoumaden Saiyuuki,Recommended,9
242140,"\n\n\n\n\n\n First, let me say ...",Hajime no Ippo,Recommended,10
242141,\n\n\n\n\n\n Utawarerumono mana...,Utawarerumono,Recommended,8


Cleaning the reviews by removing extra symbols and numbers.

In [20]:
def preprocess(text):
    """Normalise one raw review string.

    The substitutions are applied in this order:
      1. every run of whitespace becomes a single space;
      2. each listed symbol (=, bullet, star, %, *, square brackets, $, #, @,
         ellipsis character, underscore, straight and curly double quotes,
         slashes) becomes a space;
      3. a run of repeated dots collapses into one dot;
      4. the typographic apostrophe becomes a plain apostrophe;
      5. every run of digits becomes a space;
      6. whitespace runs created by the steps above are squeezed again.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text with leading and trailing whitespace stripped.
    """
    substitutions = (
        (r'[\n\r\s]+', ' '),
        (r'[=•★%\*\[\]\$#@…_\"\\\/“”]', ' '),
        (r'([.])\1+', r'\1'),
        (r'\’', '\''),
        (r'[0-9]+', ' '),
        (r'\s\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [23]:
# Store the cleaned text in a new column so the raw review stays available.
df['cleaned_review'] = df['review'].map(preprocess)
df

Unnamed: 0,review,title,tag,rating,cleaned_review
0,"\n\n\n\n\n\n Cringe ""comedy"" in...",Amagi Brilliant Park,Not Recommended,2,Cringe comedy in a nutshell except there's not...
1,\n\n\n\n\n\n So this is the thi...,Higeki no Genkyou to Naru Saikyou Gedou Last B...,Mixed Feelings,5,So this is the thing about this anime. some sp...
2,\n\n\n\n\n\n The show is depres...,Sousou no Frieren,Recommended,9,The show is depressing as fuck. But good. As f...
3,"\n\n\n\n\n\n Ugly CGI, bland vi...",Kimetsu no Yaiba,Not Recommended,3,"Ugly CGI, bland villains, bland one-dimensiona..."
4,\n\n\n\n\n\n Boring. Cliche. La...,Ousama Ranking,Not Recommended,2,Boring. Cliche. Lame. Ousama Ranking is extrem...
...,...,...,...,...,...
242138,\n\n\n\n\n\n A comedy/romance b...,Ranma ½,Recommended,7,A comedy romance based on the manga by Rumiko ...
242139,\n\n\n\n\n\n Saiyuki is one of ...,Gensoumaden Saiyuuki,Recommended,9,Saiyuki is one of those animes that just grabs...
242140,"\n\n\n\n\n\n First, let me say ...",Hajime no Ippo,Recommended,10,"First, let me say that I 'm not a fan of boxin..."
242141,\n\n\n\n\n\n Utawarerumono mana...,Utawarerumono,Recommended,8,Utawarerumono manages to be one of those harem...


In [24]:
# Checkpoint before the non-English filter: language detection takes a while,
# so save a provisional frame that still contains the non-EN reviews.
df.to_csv(
    'reviews_clean_with_non_en.csv',
    sep=';',
    encoding='utf-8',
)

Removing non-English reviews from the dataset using the langdetect library.

In [25]:
# langdetect is non-deterministic by default; fixing the seed makes the set of
# dropped rows reproducible from one run to the next.
DetectorFactory.seed = 0


def is_non_english(text):
    """Return True only when langdetect identifies `text` as a non-English language.

    Parameters
    ----------
    text : object
        Value from the "cleaned_review" column.

    Returns
    -------
    bool
        True when `detect(text)` returns a language code other than 'en'.
        False when the text is English, and also when no language can be
        determined (non-string value, blank string, or langdetect raising
        LangDetectException), so undetectable rows are kept rather than
        dropped.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) != 'en'
    except LangDetectException:
        # Only the library's own detection failure is tolerated; any other
        # error (or a keyboard interrupt) propagates instead of being hidden.
        return False


non_en_mask = df['cleaned_review'].map(is_non_english).astype(bool)
indices_non_en = df.index[non_en_mask].tolist()

df = df.drop(index=indices_non_en)

In [27]:
# Final output: the cleaned reviews after the non-English rows were dropped.
df.to_csv(
    'reviews_clean.csv',
    sep=';',
    encoding='utf-8',
)