# Clean Text 

Before train the model, we have to understand the data and do some works to ensure we have nice and clean data to fit in our model. 

In [1]:
#load libaraies
import sys, os, re, csv, codecs, string, spacy,gensim
import numpy  as np
import pandas as pd
from tqdm import tqdm
from googletrans import Translator

## Load data set

In [2]:
# load data
TRAIN_DATA_FILE='../data/train.csv'
TEST_DATA_FILE='../data/test.csv'

train = pd.read_csv(TRAIN_DATA_FILE)
test  = pd.read_csv(TEST_DATA_FILE)

In [3]:
train.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


In [4]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
test['comment_text1']  = test['comment_text']
train['comment_text1'] = train['comment_text']

## clean text data 

we will do the following:
- removing url, ip and punctuation such as ',?!
- replacement: such as change '@' by at, 's by is, 're by are, 'll by will etc
- converting to lowercase
- Fixing word lengthening: rip offs repeated characters more than 2 
- Remove number, time and year
- Replace repeated words
- translate Chinese to English


In [6]:
def reduce_lengthening(text):
    '''
    remove repeated characters
    '''
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def replace_time_year(text):
    '''
        replace time and year
    '''
    text = re.sub(r"\d{1,2}:\d{1,2}", ' time ', text)
    text = re.sub(r"\d{1,2}:\d{1,2}:\d{1,2}", ' time ', text)
    text = re.sub(r"\d{4}", ' year ', text)
    return text

def replace_ip(text):
    '''
        replace ip address
    '''
    text = re.sub(r"\d+\.\d+\.\d+\.\d+", "ip", text)
    return text

def  replace_number(text):
    '''
     replace number (if n_word smaller than 100)
    '''
    if len(text.split()) < 100:
        text = re.sub('\d+', ' number ', text)
    else:
        text = re.sub('\d+', ' ', text)
    return text

def remove_repeat_word(text):
    '''
    remove repeated word
    '''
    text = re.sub(r'\b(.+)(\s+\1\b)+', r'\1', text)  
    
    #text = ' '.join( [w for w in text.split() if ((len(w)>1) or (w=='i'))] )
    return text

def is_English(text):
    text1 = re.sub(" ", "", text)
    text1 = re.sub("\t", "", text1)
    
    text1 = re.sub('\s','',text1)
    text1 = "".join(text1.split())
    text1 = re.sub(r"[a-zA-Z]+", "Alpha", text1) #replace all english characters by 'Alpha'
    return text1.isalpha()
        

In [7]:
def google_tran(text):
    '''
        tanslate chinese to english using google API
    '''
    word_list = text.split()
    text_length = len(word_list)
    if text_length < 1:
        return text
    tran_times = text_length//100
    tran_text_en = ""
    te = 0
    for t in range(tran_times):
        ts = t*100
        te = (t+1)*100
        translator = Translator()
        short_text = ' '.join(word_list[ts:te])
        try:
            tran_text = translator.translate(short_text,dest='en').text
        except:
            tran_text = short_text
        tran_text_en = tran_text_en+tran_text
    translator = Translator()
    short_text = ' '.join(word_list[te:])
    try:
        tran_text  = translator.translate(short_text,dest='en').text
    except:
        tran_text = short_text
    tran_text_en = tran_text_en+tran_text
    return tran_text_en

def tran_en(text):
    '''
        tanslate chinese to english if not en
    '''
    
    #match = zh_pattern.search(text)
    is_En = is_English(text)
    if not is_En: 
        text = google_tran(text)
        print('.', end='', flush=True)
    return text


In [8]:
def standardize_df(df, text_field):
    
    df[text_field] = df[text_field].str.replace(r"http\S+", " url ")
    df[text_field] = df[text_field].str.replace(r"http", " url ")
    df[text_field] = df[text_field].str.replace(r"@\S+", " email ")
    df[text_field] = df[text_field].str.replace(r"@", " at ")
    df[text_field] = df[text_field].str.replace(r"Image:\S+", " image ")
    df[text_field] = df[text_field].str.replace(r"tags#\S+", " tags ")
    df[text_field] = df[text_field].str.replace(r"Wikipedia_talk\S+", " ")
    df[text_field] = df[text_field].str.lower()
    
    df[text_field] = df[text_field].apply(lambda x: replace_ip(x))
    df[text_field] = df[text_field].apply(lambda x: replace_time_year(x))
    df[text_field] = df[text_field].apply(lambda x: replace_number(x))
    
    df[text_field] = df[text_field].str.replace("what's", "what is ")
    df[text_field] = df[text_field].str.replace("\'s", " ")
    df[text_field] = df[text_field].str.replace('\'ve', " have ")
    df[text_field] = df[text_field].str.replace('don\'t', ' do not ')
    df[text_field] = df[text_field].str.replace('can\'t', ' can not ')
    df[text_field] = df[text_field].str.replace('n\'t', " not ")
    df[text_field] = df[text_field].str.replace('i\'m', "i am ")
    df[text_field] = df[text_field].str.replace(r"\'re", " are ")
    df[text_field] = df[text_field].str.replace(r'\'d', " would ")
    df[text_field] = df[text_field].str.replace(r'\'ll', " will ")
    df[text_field] = df[text_field].str.replace(r' a ', " ")
    df[text_field] = df[text_field].str.replace(r' the ', " ")
    df[text_field] = df[text_field].str.replace(r' an ', " ")
    df[text_field] = df[text_field].str.replace(r' at ', " ")
    df[text_field] = df[text_field].str.replace(r' to ', " ")
    df[text_field] = df[text_field].str.replace(r' or ', " ")
    df[text_field] = df[text_field].str.replace(r' on ', " ")
    df[text_field] = df[text_field].str.replace(r' in ', " ")
    
    df[text_field] = df[text_field].str.replace(r"谢谢", " Thanks ")
    df[text_field] = df[text_field].str.replace(r"很好", " Great ")
    df[text_field] = df[text_field].str.replace(r"你好", " Hello ")
    df[text_field] = df[text_field].str.replace(r"您好", " Hello ")
    df[text_field] = df[text_field].str.replace(r"屌你老母", " motherfucker ")
    df[text_field] = df[text_field].str.replace(r"純血主義", " Korean ethnic nationalism ")
    
    df[text_field] = df[text_field].str.replace(r"肏", " fuck ")
    df[text_field] = df[text_field].str.replace(r"操", " fuck ")
    df[text_field] = df[text_field].str.replace(r"激情", " porn ")
    df[text_field] = df[text_field].str.replace(r"视频", " video ")
    df[text_field] = df[text_field].str.replace(r"呆B", " bitch ")
    df[text_field] = df[text_field].str.replace(r"呆b", " bitch ")
    df[text_field] = df[text_field].str.replace(r"屌", " fuck ")
    df[text_field] = df[text_field].str.replace(r"小姐", " prostitute ")
    df[text_field] = df[text_field].str.replace(r"𨳒你老母个閪", " fuck your mother ")
    df[text_field] = df[text_field].str.replace(r"臭閪生了你这个臭仔", " son of bitch ")
    df[text_field] = df[text_field].str.replace(r"屌你老母閪", " fuck your mother ")
    df[text_field] = df[text_field].str.replace(r"臭妈的烂B", " fuck your mother ")
    df[text_field] = df[text_field].str.replace(r"你的母親混蛋", " your mother bitch ")
    
    df[text_field] = df[text_field].str.replace('[,="\.\]\[\?\!\/\'\(\)\-|\\\——]+',' ')
    df[text_field] = df[text_field].str.replace('[\+\:;•¢「」¤><#→™]+',' ')
    df[text_field] = df[text_field].str.replace('[\_·✰@“”’‘☎☓%\{\}~]+',' ')
    df[text_field] = df[text_field].str.replace('[♥♠♦♣⋅£☼&´♫₪–°☏]+',' ')
    df[text_field] = df[text_field].str.replace('[\^¿✉←ⓣ˜≈«»☺❤☯➥…¨]+',' ')
    df[text_field] = df[text_field].str.replace('[≠≤（）《》`№©♪♬♩✍¡►®§√∞⁄]+',' ')
    df[text_field] = df[text_field].str.replace('[△⟨⟩∇∆⇔¦⇒└┐æ🗽☿？₰≡╟☥✄−─╢‖]+',' ')

    df[text_field] = df[text_field].str.replace('[🎄🍁☘🍌✎✐●✓★☆☀×◕‿◕♂⁂]+',' ')
    df[text_field] = df[text_field].str.replace('[😂😄😊😢😃😉😅😜😏👅💦]+',' ')
    df[text_field] = df[text_field].str.replace('[☮☸♑☢☣🙈🙉🙊̇☄☽◯☾✋🏼⚇♔💩]+',' ')
    df[text_field] = df[text_field].str.replace('[👍☠😀😔💜⁠۩۞✔♝♚�☛➔⊕⊗]+',' ')
    df[text_field] = df[text_field].str.replace('[🎤✈➨¬➪‡↔↑↓¶✆☑◀▶📞📧✿▲╦╩✽∅]+',' ')
    df[text_field] = df[text_field].str.replace('[☭☻｡¸¯߷♀☃⚔◄☝💬━ºø]+',' ')
    df[text_field] = df[text_field].str.replace('[½⅓⅔¼¾⅛⅜⅝⅞ョ℠∙：▎۝҈♨！☞☜‽┘┌]+',' ')
    df[text_field] = df[text_field].str.replace('[≼≽█ロ◥๛〈⌊〉✤┏┓┃╔═╗║╚╝┗┛↗‹›✭ⓐ㊟]+',' ')
    df[text_field] = df[text_field].str.replace('[○▾↨↕☤➜֑❝❞‑ोǀǃǂǁ¹²³⁴⁰±✌▪₂✘├⇝ٔ✒ʼ❉│‒︵❦❖✗✫⁞『』آ╫ŧ㎥◦■【】]+',' ')
    df[text_field] = df[text_field].str.replace('[〇①②③④⑤。、ａｂ]+',' ')
    df[text_field] = df[text_field].str.replace('[课]+',' class ')
    df[text_field] = df[text_field].str.replace('[山崎]+',' ')
    df[text_field] = df[text_field].str.replace('[词汇]+',' vocabulary ')
    
    df[text_field] = df[text_field].str.replace('[†*$]+',' ')
    
    #text = tran_en(text)
    df[text_field] = df[text_field].apply(lambda x: reduce_lengthening(x))
    return df

def trans_df(df, text_field):
    df[text_field] = df[text_field].apply(lambda x: tran_en(x))
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z]", " ")
    df[text_field] = df[text_field].apply(lambda x: remove_repeat_word(x))
    
    return df

In [9]:
def clean_data(df):
    df.fillna('_na_')
    df =  standardize_df(df, "comment_text")
    df =  trans_df(df, "comment_text")
    
    #convert to list
    list_sentences = df["comment_text"].tolist()
    
    return df, list_sentences

In [10]:
train, list_sentences_train = clean_data(train)

.........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [11]:
test,  list_sentences_test  = clean_data(test)

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [12]:
print("number of training example: {}".format(len(list_sentences_train)))
print("First ten text comments (atfer clean): ")

list_sent = list_sentences_train[0:10]
for i, sent in enumerate(list_sent):
    print('---------------------------------')
    print("[{}]  {}".format(i,sent))

number of training example: 159571
First ten text comments (atfer clean): 
---------------------------------
[0]  explanation why edits made under my username hardcore metallica fan were reverted  they were not  vandalisms  just closure some gas after i voted new york dolls fac  and please  do not  remove template from talk page since i am  retired now ip
---------------------------------
[1]  d aww  he matches this background colour i am  seemingly stuck with  thanks  talk  time  january  number  year  utc 
---------------------------------
[2]  hey man  i am  really not trying edit war  it  just that this guy is constantly removing relevant information and talking me through edits instead of my talk page  he seems care more about formatting than actual info 
---------------------------------
[3]    more i  can not  make any real suggestions improvement  i wondered if section statistics should be later on  subsection of  types of accidents  i think references may need tidying so that 

In [13]:
print("number of training example: {}".format(len(list_sentences_test)))
print("First ten text comments (atfer clean): ")

list_sent = list_sentences_test[0:10]
for i, sent in enumerate(list_sent):
    print('---------------------------------')
    print("[{}]  {}".format(i,sent))

number of training example: 153164
First ten text comments (atfer clean): 
---------------------------------
[0]  yo bitch ja rule is more succesful then you will  ever be whats up with you and hating you sad mofuckas i should bitch slap ur pethedic white faces and get you kiss my ass you guys sicken me  ja rule is about pride da music man  dont diss that shit him  and nothin is wrong bein like tupac he was brother too fuckin white boys get things right next time 
---------------------------------
[1]    from rfc  title is fine as it is  imo 
---------------------------------
[2]        sources  zawe ashton lapland  
---------------------------------
[3]   if you have look back source  information i updated was correct form  i can only guess source had not  updated  i shall update information once again but thank you for your message 
---------------------------------
[4]  i  do not  anonymously edit articles all 
---------------------------------
[5]  thank you for understanding  i th

## Example data after cleaning

In [14]:
train['str_len'] = train['comment_text'].apply(lambda x: len(x.split()))


In [15]:
test['str_len']  = test['comment_text'].apply(lambda x: len(x.split()))

In [16]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text1,str_len
0,0000997932d777bf,explanation why edits made under my username h...,0,0,0,0,0,0,Explanation\nWhy the edits made under my usern...,42
1,000103f0d9cfb60f,d aww he matches this background colour i am ...,0,0,0,0,0,0,D'aww! He matches this background colour I'm s...,19
2,000113f07ec002fd,hey man i am really not trying edit war it ...,0,0,0,0,0,0,"Hey man, I'm really not trying to edit war. It...",38
3,0001b41b1c6bb37e,more i can not make any real suggestions i...,0,0,0,0,0,0,"""\nMore\nI can't make any real suggestions on ...",98
4,0001d958c54c6e35,you sir are my hero any chance you remember...,0,0,0,0,0,0,"You, sir, are my hero. Any chance you remember...",13


In [17]:
max_len = np.max(train['str_len'])
min_len = np.min(train['str_len'])

print(max_len)
print(min_len)

902
1


In [18]:
df_train_non = train[train['str_len'] < 1]
print(df_train_non.shape[0])

for id in range(df_train_non.shape[0]):
    text = df_train_non['comment_text'].iloc[id]
    text_id = df_train_non['id'].iloc[id]

    origin_text = train[train['id']==text_id]['comment_text1'].iloc[0]
    print(text, text_id, origin_text)

0


In [19]:
df_train_non.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text1,str_len


In [20]:
train = train.drop(['comment_text1', 'str_len'], axis=1)
train.to_csv('../data/train_clean.csv', index=False)

## test  set

In [21]:
max_len = np.max(test['str_len'])
min_len = np.min(test['str_len'])

print(max_len)
print(min_len)

1603
0


In [22]:
def tran_en1(text):
    '''
        tanslate chinese to english
    '''
    translator = Translator()
    try:
        text = translator.translate(text,dest='en').text
    except:
        print("can not translate")
    return text

In [23]:
def tran_en2(text):
    '''
        tanslate chinese to english
    '''
    word_list = text.split()
    text_length = len(word_list)
    
    tran_times = text_length//100
    tran_text_en = ""
    te = 0
    for t in range(tran_times):
        ts = t*100
        te = (t+1)*100
        translator = Translator()
        short_text = ' '.join(word_list[ts:te])
        tran_text = translator.translate(short_text,dest='en').text
        tran_text_en = tran_text_en+tran_text
    translator = Translator()
    short_text = ' '.join(word_list[te:])
    tran_text  = translator.translate(short_text,dest='en').text
    tran_text_en = tran_text_en+tran_text
    return tran_text_en

In [24]:
df_test_non = test[test['str_len'] < 1]
print(df_test_non.shape[0])

for id in range(df_test_non.shape[0]):
    text = df_test_non['comment_text'].iloc[id]
    text_id = df_test_non['id'].iloc[id]

    origin_text = test[test['id']==text_id]['comment_text1'].iloc[0]
    trans_text  = tran_en2(origin_text)
    trans_text  = re.sub(r"[^A-Za-z]", " ", trans_text)
    trans_text  = re.sub("[.,.:=/]+", " ", trans_text) 
    
    print('--------------------------------------')
    print(text)
    print('++++++++++')
    print(text_id, origin_text)
    print('==========')
    print(trans_text)
    
    test.loc[test['id'] == text_id, 'comment_text'] = trans_text
    

535
--------------------------------------
                            
++++++++++
000d4f120d5a7303 일이삼사오육칠팔구하고십이요 에헤헤 으헤 으헤 으허허
I have seven years of life and I have ten years old 
--------------------------------------
                                                           
++++++++++
003044a2c35274b6 السلام عليكم و رحمة الله و بركاته الا الجميع 
 تفضلوا جميعا
Peace  mercy and blessings of God be upon you
--------------------------------------
                                        
++++++++++
00b568d3b0f61a37 Радченко ис мы дирты гаы анал сех славе!!
Radchenko and we dirty gai anal se glory   
--------------------------------------
                                                
++++++++++
00d2aca8d65aa590 == ޗޮޮލ! == 

 މަލދިވެސ ިސ ަ ޕރެތތޔ ގޮޮދ ިސލަނދ ނަތިޮނ. ި ސުރެ އޮުލދ ލޮވެ ތޮ ގޮ ތހެރެ އިތހ މޔ ފރިެނދސ!
                                                                                                  
--------------------------------------
                                  

--------------------------------------
                                      
++++++++++
042eb8d7b985aff6 אני מבקש לערוך את הדף לביצוע עריכת תוכ
I would like to edit the page to edit a content
--------------------------------------
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
++++++++++
048d1a6ab0d93bdb "灌装设备|封口设备|打码设备包装机|包装机械  
 包装机械  
 封口机 
 收缩机|热收缩机|热收缩包装机灌装机 
 灌装机|打码机|封口机     
 灌装机械 
 食品包装机械  
 灌装机|打码机|封口机|包装机械食品包装机械  
 打码机 
 防静电地板 
 包装设备 
 包装机械 
 写真机 
 包装机械  
 打码机      
 写真机 
 打码机  	  	  
 包装机械|包装设备|设备 
 包装机械  
 打码机

--------------------------------------
           
++++++++++
07ee5f40e36e4190 * Спасибо! -)
  Thank you    
--------------------------------------
         
++++++++++
08323f2f0a13c416 " 

  
 *   "
     
--------------------------------------
                                                                                       
++++++++++
096375408007d156 :Ε όχι ρε συ, έλεος! Δε μπορούν να μας το κάνουν αυτό. Μήπως μεταφέρθηκε στον Μπίθουλα;
  Oh no  mercy  They can not do this to us  Was he transferred to Bethoulah 
--------------------------------------
                                                                                         
++++++++++
0a5d45f269d20cf6 앉을때도 있다. 

 너무 불공정하다.... 

 좌석마다 행선지 적는 푯말이 있으면, 앉는 사람이 표기하고 

 서있는 사람들은 행선지 보고서 판단하면 좋지 않을까
Sometimes I sit  It s too unfair      If there are signposts for each seat  it would be a good idea to judge who seated the people sitting and who they are standing at 
--------------------------------------
                

--------------------------------------
                                                                                        
++++++++++
0becd3dca1a6e80d Да, стварно, и тоа едни од поактивните сме. Одлично, ќе имам од кого да поднаучам нешто.
Yes  indeed  and that s one of the more active ones  Great  I ll have someone to teach something 
--------------------------------------
                                         
++++++++++
0c542944418b65dd انا مغربي وافتخر بمغربيتي واحب ملكي ووطني
I am Moroccan and proud of my Moroccan and my royal and national love
--------------------------------------
                                                                                                         
++++++++++
0c94f513008dc6ec : Διόρθωσα λίγο ακόμη την πρόταση με την βοήθεια μιάς φίλης μου Αγγλίδας. Πιστεύω οτι τώρα είναι εντάξει.
  I just corrected the proposal a little with the help of my friend English  I think now it s okay 
--------------------------------------
                  

--------------------------------------
                                                                                     
++++++++++
125ff3bb419d6c69 مقام اول اسياي ميانه با تيم ملي اميد 
 دو دوره قهرماني جام شيخ راشد در دبي با تيم ملي
First place in Middle Ages with national team Hope of two wins of Sheikh Rashid Cup in Dubai with national team
--------------------------------------
                                                                                                                                                                                                                                                                                                                                                                                       
++++++++++
1272c25ed799c527 རང་གི་རིག་གནས་ཆུ་ཚད་ཡག་པོ་མེད་ཀྱང་། དྲ་རྒྱའི་རིག་གནས་འདི་ང་ཚོར་དགོས་གལ་ཆེ་བས། ང་ཝེ་ཁེ་བ་ཞིག་བྱེད་པར་སྤྲོ་བོ་ཡོད། ཡིན་ཡང་། གཅིག་ནས་ལག་གསར་ཡིན་པ་དང་། གཉིས་ནས་ཁྱེད་རང་གིས་བྱིན་པའི་ཡིག་ཆ་ཚང་མ་དབྱིན་ཇི་ཡིན་པས་བད

--------------------------------------
                                                       
++++++++++
13e579e382d771fe اميري يك پاكت سيگار وينستون دارم كه تمام رنگها را دارد.
Amiri has a Winston cigar pack that has all the colors 
--------------------------------------
                                                                                  
++++++++++
14d4a9ddc2f8a1f7 ::Για αναύξητο αόριστο δεν ξέρω, αλλά το έλυσα και το έπαθες παραμένουν αναλλοίωτα.
   I do not know for an indefinite change  but I have solved it and the ravages remain unchanged 
--------------------------------------
                                                                     
++++++++++
1567e275877b4041 Ашик паша Заде помшье име Билиш Кобила. За Уруца убица је био један [...]
Aishik pasha Zade gestures the name of Bilish Kobila  For Uroca the killer was one      
--------------------------------------
                                                                                              

--------------------------------------
                                 
++++++++++
15e0624afa6aa772 الخلیج الفارسی من الازل الی الابد
The Persian Gulf from eternity to eternity
--------------------------------------
                 
++++++++++
168a20612d874667 ::::::Βουρ στον πατσά!
       Vour to the stomach 
--------------------------------------
                                                                                                                                                                                                             
++++++++++
1692c26aa88e101c 发光棒 
 搅拌机  砂浆搅拌机 
 聚脲    
 聚氨酯喷涂  聚脲防腐   
 墙体保温材料 
 循环水真空泵 
 磁力搅拌器 
 旋转蒸发器 
 恒温干燥箱 
 干洗 
 干洗机 
 干洗设备 
 干洗加盟 
 干洗连锁 
 干洗店加盟 
 热收缩包装机 
 收缩包装机 
 收缩机 
 热收缩机 
 搬场 
 上海搬场 
 搬场公司 
 上海搬场公司 
 搬家 
 搬家公司 
 上海搬家 
 上海搬家公司
Glow Stick Mixer Mortar Mixer Polyurea Polyurethane Spray Polyurea Anticorrosion Wall Insulation Material Circulating Water Vacuum Pump Magnetic Stirrer Rotary Evaporator Thermostatic Dryer Dry Cleaning D

--------------------------------------
                       
++++++++++
1c641c3099a42ed4 Это пиздец. Это пиздец.
That s fucked up  That s fucked up 
--------------------------------------
                                                                                            
++++++++++
1d73453f67e0fa79 ::: Очень хорошо раз такое существо как ты понимает только язык санкций будем говорить на ЗКА.
    Very well times such a creature as you understand only the language of sanctions we will talk to ZKA 
--------------------------------------
         
++++++++++
1ee703ce84fb9a34 " 

    — ·  "
       
--------------------------------------
                                                                                                                                                                                                                 
++++++++++
1f12d92e63aab863 :::::::::::::::კობერ, ერთი ამას შეხედე რა. ერთი ჩრდილოელი ვანია ებრძვის კოლხეთის თემას. ქართული სამეფო არ იყოო 

--------------------------------------
                                      
++++++++++
21d8b7f5833006ea Κατάλογος βασιλέων της αρχαίας Σπάρτης
List of kings of ancient Sparta
--------------------------------------
                                                                                                                     
++++++++++
226464273c904491 Велимир Илић  

 Вратио сам Ваше претходне измјене. Нема потребе за тим. Могу се наћи прикладне слике. Поздрав.    с. р.
Velimir Ilic I returned your previous changes  No need for that  Suitable images can be found  Greeting  s  r 
--------------------------------------
                                                                         
++++++++++
234f74980cd507f4 ブロックについて  
 日本語版でも書きましたが、ブロックの理由を教えてください。あ、こちらも日本語版の方が分かりやすいのでよろしくお願いします。
I wrote about the block in the Japanese version  please tell me the reason of the block  Oh  thank you for your understanding as the Japanese version is easier to understand 
----------------

--------------------------------------
                                                                                                                                                                                                                                                                  
++++++++++
2e704370e973a98f Да, роден сум во Битола, но не живеам таму сега. Да јас ја ревидирав статијата за НОБ, мило ми е што ти се допаѓа. Сега е многу поинформативна и е збогатена со полезни фотографии, мапи и фактологија. Би сакал одвреме навреме да ја надгледуваш. Поздрав голем.
Yes  I was born in Bitola  but I do not live there now  Yes  I revised the article for the NOB  I m glad what you like  Now it is very informative and is enriched with useful photos  maps and factology  I would like to oversee you from time to time  Greeting big 
--------------------------------------
      
++++++++++
2ebb8aefc22c92f9 " 

 :~   "
      
--------------------------------------
                   

--------------------------------------
                                                                                                                                                            
++++++++++
366089836a89399b Όταν βλέπεις τη σελίδα συζήτησης ενός χρήστή, ή τη σελίδα χρήστη, κάτω αριστερά έχει ένα μενού, η τρίτη επιλογή είναι η συνεισφορά του συγκεκριμένου χρήστη.
When you see a user s chat page  or the user page  at the bottom left a menu  the third option is the contribution of that user 
--------------------------------------
                                                                                     
++++++++++
36c7277a1858e52e Политика конфиденциальностиОписание ВикипедииОтказ от ответственностиМобильная версия
Privacy PolicyDescriptionWikipediaLetter from liabilityMobile version
--------------------------------------
                                                   
++++++++++
36d4fe8de3fe0298 . 
 당신이 내 문서를 지시를 운전하려면 비가의 호수, 서스 캐처 원 알려 주시기 바랍니다
  If you w

--------------------------------------
                                                                                                                                                                                       
++++++++++
40632c5d935eace8 Ей, цингиби, на майка ти путката лайно осрано!!! Баща ти те е изпикал, а майка ти те е изсрала, да еба мангала ти прост да еба!!! Аре ходи се еби в гъза, а, чакай, ти вече го правиш това.
Hey  yeah  your mother s pussy shit  Your father has scorched you  and your mother has sucked you up  you fuck your bang  Ares walks into a ass  wait  you already do that 
--------------------------------------
                                               
++++++++++
40f2f3b2a87e6a81 بسم الله الرحمن الرحيم 
 اسلام عليكم ورحمت الله
In the name of Allah Most Gracious Most Merciful  Peace and Mercy be upon you
--------------------------------------
                                                                                                             

--------------------------------------
                                                                                                             
++++++++++
44dcbe80e5669d00 == Украинский футбол == 

 Давай лучше помогай писать статьи о укр. футболе на украинской или русской википедии.
   Ukrainian football    Let s better help write articles about ukr  football on the Ukrainian or Russian wikipedia 
--------------------------------------
                                                                                                                                                                                            
++++++++++
4513aa2dd89d0b06 שלום. אני אוהב את היהודים ואת כל עמי העולם. אני מקווה כי תוכל להתחיל להפסיק לכתוב שקרים על טניקים. שים לב הסרבים לאחר היהודים הרוסים והפולנים היו הנפגעים ביותר במלחמת העולם השנייה. כל טוב.
Peace  I love the Jews and all the peoples of the world  I hope that you can start to stop writing lies about tannicans  Note that the Serbs after t

--------------------------------------
                                    
++++++++++
4cbdb285a99075d2 경복궁 창덕궁 창경궁 경희궁 덕수궁 동방정교회 김구 장준하 여운형
Gyeongbok Palace Changdeok Palace Changgyeong Palace Gyeonghuigung Palace Deoksu Palace Orthodox Church
--------------------------------------
                                                                                                                                                         
++++++++++
4ce89b2f96c978a2 תקשיבו לי אתם חתיכת מופרעים תומכים במוות של גואן ריברס ועוד דברים החרות לא יכולה להיות המפלגה העיקרית בישראל מכיוון שהיא אף פעם לא הייתה חלק ממשלת ישראל.
Listen to me you are a bit of a misguided supporter of the death of Gwen Rivers and other things that freedom can not be the main party in Israel because it has never been part of the Israeli government 
--------------------------------------
                                                                                                                                         

--------------------------------------
                     
++++++++++
54369fa54be69277 دكتور محمد عبد السلام
Dr  Mohamed Abdel Salam
--------------------------------------
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
++++++++++
546a6c12520ca8bb "خز معانی ز

--------------------------------------
                                                                                                                                                                                                                               
++++++++++
59a7f078577a3b74 :::::მუხრანელები არ არიან მერე პასპორტით მუხრანსკები? მაშინ სამივეს გარუსებული სახელი უნდა მივცეთ. იმერეტინსკებს, გრუზინსკებს და მუხრანსკებს. მე ამ დაყოფას მაინც არ ვეთანხმები და ჯობდა ერთად ყოფილიყო და დაყოფის აზრს ვერ ვხედავ.
      Are Mukhranians not after passports with injections  Then we have to give the three bad names  Immortine  gruzinziks and masks  I do not agree with this division and I would have been together and I do not see the idea of   dividing 
--------------------------------------
                                                                                                                              
++++++++++
5ab0708b43ece87f == Нужна звезда == 

 Здравствуйте! Очень нужна

--------------------------------------
                      
++++++++++
5c67d7a55958c554 خالد محمد أحمد المناري
Khalid Mohammed Ahmed Al Manary
--------------------------------------
                                                         
++++++++++
5c81f60a6307c55c Долой ментовское государство. 
 Долой Шерифофскую страну.
Down with the Mentovsky state  Down with the Sheriff country 
--------------------------------------
                           
++++++++++
5ce3338e24d2d089 ხელებს დაგამტვრევ შე ახვარო
Hands off my hands
--------------------------------------
                                                                                                                                                                                                                                                                                                                                                                                                                                             

--------------------------------------
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
++++++++++
5f9c7610f0d83976 Ξέρετε αισθάνομαι λίγο περίεργα γιατί το θέμα, αν δείτε τις συζητήσεις ξεκίνησε από εμένα, όταν είδα τι παπατζιλίκια γράφανε οι τύποι.Ομολογώ πως δεν έχω απεριόριστες γνώσεις στο θέμα (ένα βιβλίο του Μπαμπινιώτη, κανα δυο αρθρα,κτ

--------------------------------------
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
++++++++++
66d7647556eb4d69 :: Я не был никогда в Закарпатье, поэтому знаком с ситуацией только по сообщениям в прессе, в т.ч. и на русинских сайтах. Из чего сделал вывод, что число людей, счиатющих себя русинами, читающими русинские издания или входящими в русинские организации, даже по оценкам этих организаций находится примерно на уровне данны

--------------------------------------
                                                                                                                                                                              
++++++++++
6dd60ba628536ecb Ο καθηγητής της Φιλοσοφικής σχολής του Πανεπιστημίου της Αθήνας Δημήτρης Λιαντίνης υπάρχει στη γερμανική έκδοση της εγκυκλοπαίδειας. Δεν είναι φρόνιμο να μην υπάρχει και εδώ.
Professor of the Faculty of Philosophy of the University of Athens Dimitris Liantinis is in the German edition of the Encyclopedia  It is not wise not to be here 
--------------------------------------
                               
++++++++++
6e5493e56b3e08e0 السلام عليكم ورحمة الله وبركاته
Peace  mercy and blessings of God
--------------------------------------
                                                                                                                                                                                                                        

--------------------------------------
                                                                                                                                          
++++++++++
6f60af95fa052a2b يلعن كل جزائري يا ابن كلاب كس اختك يا حقير يلعن كل الجزائرين يا ابن الكلب يا حقير كس امك الشرمطة الجزائرين احقر ناس انا كرهة الجزائر بسببك
Curse every Algerian  the son of the dogs of your sister  O despicable  cursing all the Algerians  O son of the dog  you poor man  your mother s father  the crooks  the Algerians  the worst people  I hate Algeria because of you 
--------------------------------------
                    
++++++++++
6fbe17de0ebd1a5b это то, что я думаю.
this is what I think 
--------------------------------------
                                            
++++++++++
70cb62382969a1ea ::::Ты меня пугаешь. Что именно тебе нравится? -)
    You re scaring me  What exactly do you like    
--------------------------------------
                 
++++++++++
72299fc99e39b

--------------------------------------
                                                                                                                                                                                                                                   
++++++++++
76fb08af4f78d190 Ο χρηστης εχει απολυτο δικιο σχετικα με το ζειμπεκικο αν βρεις καποιες πληροφοριες και συ σχετικες με την αυθεντικοτητα καλο ειναι να τις αναρτησεις , αλλα οχι βασισμενες σε μονομερης αναφορες , αντικειμενικες ειναι προτιμοτερο ...
The user is absolutely right about the life if you find some information about the authenticity and it is good to post them  but not based on unilateral references  objective is preferable    
--------------------------------------
                                                                                                                                                                                                                                                 

--------------------------------------
                                                          
++++++++++
7e8ad3f83cb718ca == أمجد راضي == 

 أمجد راضي انتقل لنادي القوة الجوية العراقي
Amjad Radhi    Amjad Rady moved to the Iraqi Air Force Club
--------------------------------------
                                                                                                              
++++++++++
7e94d2b5b7bc282a ==잘 해 봅시다== 

 님...진실로 나라를 사랑한다면 육군사관학교로 합격 받으세요. 육사에서 만나 큰 뜻을 이룹시다. 님...꼭 육사를 생각 해보세요. 
 저도...육사를 드러가기를 최선을 다 해보겠습니다.
   Let s do it well    Sir     If you truly love your country  get accepted to the Military Academy  Let  s meet in the office and make a big deal  Mr      Please think about Mr  Sensei  I will try my best to show you 
--------------------------------------
                                                                                   
++++++++++
7eb7446115e3d36f 매일의 일상처럼 스치는 일들을 정리하여 최종적으로 책을 만드는 것이 목적이다. 
 그 책이 무슨 형태가 되든 무슨 모양으로 되든, 무슨 내용이 되든...

--------------------------------------
               
++++++++++
83f20ec791331d46 برو بینیم بابا.
Go see daddy
--------------------------------------
                      
++++++++++
844e10ddf98681e8 قريبا نتابع لكم الباقي
Soon we will follow you the rest
--------------------------------------
                                                                                                       
++++++++++
845b14154d38cece Έχει και κάτι άλλα μικρολαθάκια, ΜΗΝ τα διορθώσεις.  Θα τα χρησιμοποιήσουμε όταν πρέπει και όπως πρέπει.
It has something else  too  DO NOT make corrections  We will use them when they should and should 
--------------------------------------
                        
++++++++++
851255f3a67ef9c9 بدنا عربي يا اجانب قراوه
We are an Arab  foreigners  a coward
--------------------------------------
                                                                                                                                                             
++++++++++
857a1

--------------------------------------
                                                                                                             
++++++++++
8a0874b4a40e6f6f Е хвала што си ми рекао за ту црту коју нисам убацио. То ћу сада, а и истовремено направити оне друге измене.
Thank you for telling me about the line I did not put in  I will do it now and at the same time make those other changes 
--------------------------------------
                                                                                                                                                                                                                                                                                                                     
++++++++++
8a1d61efd14da0ae А наиболее безутешным пенсионерам, да и самым фанатичным из депутатов КПРФ, вроде Шандыбина, можно посоветовать, раз уж им не удалось героически погибнуть при охране тела любимого вождя, совершить публичные харакир

--------------------------------------
                                                                                                                
++++++++++
90fdf6e8e18f1946 Приветствую Вас снова. Загляните, пожалуйста, сюда. Надо бы выработать общую позицию по этому вопросу. Спасибо!   |
Greetings again  Look  please  here  We need to work out a common position on this issue  Thank you     
--------------------------------------
                                              
++++++++++
91509e7672bacfda ΥΓ Τι εγινε με τις φωτογραφιες απο την Εδεσσα?
What happened with the photos from Edessa 
--------------------------------------
                                                     
++++++++++
918866da7dc63238 אני הולך לישון עכשיו ומחר אני אהיה פה. 
 תסמכו על זה.
I m going to bed now and tomorrow I ll be here  Count on it 
--------------------------------------
                                               
++++++++++
91d4ea6a4aeb417d Συμφωνω απολυτα, οπως εξαλλου και οι σκοπιαν

--------------------------------------
                                                                                        
++++++++++
977f5f2a713a65eb == שוהם אמזלג == 

 כמה מלך אפשר להיות שוהם זה הדוגמא עובד בתור מתאבק ניצח את דל ריו פעמיים
   Shoham Amazleg    How Much King Can Be Shoham This example works as a wrestler who defeated Del Rio twice
--------------------------------------
                                            
++++++++++
97df9dd5f2b9942b Спасибо. Я вас совсем замучила моим балетом.
Thank you  I absolutely tortured you with my ballet 
--------------------------------------
                                                                                                                                                                                                                                                                                                                                                                                                          

--------------------------------------
                                                                             
++++++++++
9fea96fb811c62aa Вспоминая...== 

 Хороший, годный мультик. Жалко, что про него нет в РуВики.   

 ==
Remembering        Good  suitable cartoon  It s a pity that he is not in RuViki    
--------------------------------------
                                    
++++++++++
a28e7bab06d438fc شاليط والجنود المخطوفين في دار دارهم
Shalit and the abducted soldiers in their home
--------------------------------------
                                                              
++++++++++
a305a04fbdb7918b == отвечай на сообщения ты сын проститутки == 

 когда тебе пишут
   answer the messages you son of a prostitute    when you write


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [25]:
test.head()

Unnamed: 0,id,comment_text,comment_text1,str_len
0,00001cee341fdb12,yo bitch ja rule is more succesful then you wi...,Yo bitch Ja Rule is more succesful then you'll...,71
1,0000247867823ef7,from rfc title is fine as it is imo,== From RfC == \n\n The title is fine as it is...,9
2,00013b17ad220c46,sources zawe ashton lapland,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",4
3,00017563c3f7919a,if you have look back source information i u...,":If you have a look back at the source, the in...",32
4,00017695ad8997eb,i do not anonymously edit articles all,I don't anonymously edit articles at all.,7


In [26]:
test = test.drop(['comment_text1', 'str_len'], axis=1)

In [27]:
test.to_csv('../data/test_clean.csv', index=False)