In [6]:
# Easy data augmentation techniques for text classification
# Jason Wei and Kai Zou

import random
from random import shuffle
random.seed(1)

# Stop words: tokens that synonym_replacement never selects as a replacement
# target. The trailing empty string makes empty tokens count as stop words too.
stop_words = (
    "i me my myself we our "
    "ours ourselves you your yours "
    "yourself yourselves he him his "
    "himself she her hers herself "
    "it its itself they them their "
    "theirs themselves what which who "
    "whom this that these those am "
    "is are was were be been being "
    "have has had having do does did "
    "doing a an the and but if or "
    "because as until while of at "
    "by for with about against between "
    "into through during before after "
    "above below to from up down in "
    "out on off over under again "
    "further then once here there when "
    "where why how all any both each "
    "few more most other some such no "
    "nor not only own same so than too "
    "very s t can will just don "
    "should now"
).split() + ['']

#cleaning up text
import re
def get_only_chars(line):
    """Reduce ``line`` to lowercase ASCII letters separated by single spaces.

    Straight and curly apostrophes are deleted, so "don't" becomes "dont".
    Hyphens, tabs, newlines and every other character outside ``a-z`` are
    turned into a space. Runs of spaces are then collapsed and one leading
    space is removed. A trailing space, if any, is kept.

    Args:
        line: the raw text (``str``).

    Returns:
        The cleaned string. Empty input, or input with no letters, yields
        ``""``; the previous version indexed ``clean_line[0]`` and raised
        ``IndexError`` on an empty string.
    """
    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ")  # hyphenated words become two tokens
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    # Build the result in one pass instead of repeated string concatenation.
    clean_line = "".join(
        char if char in 'qwertyuiopasdfghjklzxcvbnm ' else ' '
        for char in line
    )

    clean_line = re.sub(' +', ' ', clean_line)  # collapse runs of spaces
    # startswith() is safe on "", unlike clean_line[0].
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line

########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

#for the first time you use wordnet
#import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet 

def synonym_replacement(words, n):
    """Replace distinct non-stop-words of ``words`` with WordNet synonyms.

    Candidate words are the unique entries of ``words`` that are not in
    ``stop_words``, visited in shuffled order. Every occurrence of a candidate
    that has at least one synonym is replaced by one randomly chosen synonym.
    The loop stops once ``n`` candidates have been replaced; the check runs
    after each candidate, so at least one candidate is always examined.

    Args:
        words: list of tokens; it is copied, not modified.
        n: number of distinct words to replace.

    Returns:
        A new token list. Multi-word synonyms are split into separate tokens.
    """
    augmented = words.copy()
    candidates = list(set(word for word in words if word not in stop_words))
    random.shuffle(candidates)

    replaced_count = 0
    for target in candidates:
        options = get_synonyms(target)
        if len(options) >= 1:
            replacement = random.choice(options)
            augmented = [replacement if word == target else word
                         for word in augmented]
            replaced_count += 1
        if replaced_count >= n:  # only replace up to n words
            break

    # A synonym may contain spaces ("ice cream"); joining and re-splitting
    # turns such an entry back into one token per word.
    return ' '.join(augmented).split(' ')

def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as cleaned lowercase strings.

    Every lemma name of every synset of ``word`` is lowercased, has ``_`` and
    ``-`` turned into spaces, and is stripped of any character that is not a
    lowercase ASCII letter or a space.

    Changes compared with the previous version:
      * lemmas that contain no letters at all (for example a purely numeric
        lemma) used to be returned as the empty string ``''``, which callers
        would then insert into sentences; they are now skipped, and
        leading/trailing/double spaces left behind by removed characters are
        normalised;
      * the result is sorted, so its order no longer depends on set iteration
        order (which varies between interpreter runs for strings).

    Args:
        word: the token to look up.

    Returns:
        A sorted list of distinct synonyms, never containing ``word`` itself
        or an empty string. Empty when WordNet has no usable lemma.
    """
    allowed = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed)
            # Collapse whitespace left over from removed characters.
            synonym = " ".join(synonym.split())
            if synonym:
                synonyms.add(synonym)
    synonyms.discard(word)
    return sorted(synonyms)

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Drop each word of ``words`` independently with probability ``p``.

    Args:
        words: list of tokens; it is not modified.
        p: deletion probability for each word.

    Returns:
        ``words`` itself when it has zero or one element (nothing can be
        deleted). Otherwise a new list of the surviving words in their
        original order, or a single randomly chosen word when every word was
        deleted.

    The previous version only special-cased one-word input, so an empty list
    fell through to ``random.randint(0, -1)`` and raised ``ValueError``.
    """
    # Nothing to delete from an empty or one-word sentence.
    if len(words) <= 1:
        return words

    # A word survives when its draw lands above p.
    new_words = [word for word in words if random.uniform(0, 1) > p]

    # If everything was deleted, fall back to one random word.
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` successive calls to ``swap_word``.

    Args:
        words: list of tokens; it is copied, not modified.
        n: how many swap attempts to perform.

    Returns:
        The new token list.
    """
    swapped = words.copy()
    for _step in range(n):
        swapped = swap_word(swapped)
    return swapped

def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of ``new_words`` in place.

    A first index is drawn, then up to four draws are made for a second,
    different index. If all four collide with the first index (always the
    case for a one-element list) the list is returned unchanged.

    Args:
        new_words: list of tokens; it is modified in place.

    Returns:
        The same list object that was passed in.

    The previous version called ``random.randint(0, -1)`` for an empty list
    and raised ``ValueError``; an empty list is now returned untouched.
    """
    if not new_words:
        return new_words

    last = len(new_words) - 1
    first_idx = random.randint(0, last)
    second_idx = first_idx
    attempts = 0
    while second_idx == first_idx:
        second_idx = random.randint(0, last)
        attempts += 1
        if attempts > 3:
            # Give up rather than loop forever on very short lists.
            return new_words
    new_words[first_idx], new_words[second_idx] = new_words[second_idx], new_words[first_idx]
    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``add_word`` on it.

    Args:
        words: list of tokens; it is copied, not modified.
        n: number of insertion attempts.

    Returns:
        The new token list.
    """
    grown = words.copy()
    for _step in range(n):
        add_word(grown)
    return grown

def add_word(new_words):
    """Insert a synonym of a random word of ``new_words`` at a random position.

    Up to ten randomly picked words are tried until one has at least one
    synonym (via ``get_synonyms``); if none does, the list is left unchanged.

    Args:
        new_words: list of tokens; it is modified in place.

    Returns:
        None.

    Fixes compared with the previous version:
      * an empty list raised ``ValueError`` from ``random.randint(0, -1)``;
        it is now a no-op;
      * the synonym was always ``synonyms[0]`` although it was named
        ``random_synonym``; it is now drawn with ``random.choice``;
      * the insert index stopped at ``len(new_words) - 1``, so a word could
        never be added at the end of the sentence; the end is now allowed;
      * synonyms found on the tenth attempt were discarded; they are now used.
    """
    if not new_words:
        return

    synonyms = []
    for _attempt in range(10):
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            break
    else:
        # No sampled word had a synonym; give up without changing the list.
        return

    random_synonym = random.choice(synonyms)
    # randint is inclusive on both ends, so len(new_words) means "append".
    random_idx = random.randint(0, len(new_words))
    new_words.insert(random_idx, random_synonym)

########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Produce augmented variants of ``sentence`` with the four EDA operations.

    The sentence is cleaned with ``get_only_chars`` and split on spaces. Each
    enabled technique then generates ``int(num_aug/4)+1`` variants, in the
    order synonym replacement, random insertion, random swap, random deletion.
    The variants are cleaned again, shuffled and trimmed.

    Args:
        sentence: raw input text.
        alpha_sr: fraction of the word count to replace with synonyms; the
            technique is skipped when this is not > 0.
        alpha_ri: fraction of the word count to insert; skipped when not > 0.
        alpha_rs: fraction of the word count to swap; skipped when not > 0.
        p_rd: per-word deletion probability; skipped when not > 0.
        num_aug: when >= 1, the maximum number of variants kept; when < 1,
            each variant is kept with probability
            ``num_aug / number_of_variants``.

    Returns:
        A list of strings: the kept variants followed by the cleaned original
        sentence as the last element.

    Fixes compared with the previous version:
      * empty tokens were filtered with ``word is not ''``, an identity test
        against a literal (Python emits a SyntaxWarning for it); the
        comparison is now by value;
      * a sentence with no letters at all crashed inside the techniques; it
        now returns just the cleaned sentence;
      * empty variants are dropped before the second cleaning pass;
      * ``num_aug < 1`` with every technique disabled divided by zero;
      * a non-integer ``num_aug >= 1`` is truncated before slicing.
    """
    sentence = get_only_chars(sentence)
    words = [word for word in sentence.split(' ') if word != '']
    num_words = len(words)

    # Nothing to augment: return only the (cleaned) original.
    if num_words == 0:
        return [sentence]

    num_new_per_technique = int(num_aug/4)+1

    # (function, second argument) for every enabled technique, in run order.
    techniques = []
    if alpha_sr > 0:
        techniques.append((synonym_replacement, max(1, int(alpha_sr*num_words))))
    if alpha_ri > 0:
        techniques.append((random_insertion, max(1, int(alpha_ri*num_words))))
    if alpha_rs > 0:
        techniques.append((random_swap, max(1, int(alpha_rs*num_words))))
    if p_rd > 0:
        techniques.append((random_deletion, p_rd))

    augmented_sentences = []
    for technique, amount in techniques:
        for _ in range(num_new_per_technique):
            a_words = technique(words, amount)
            augmented_sentences.append(' '.join(a_words))

    # Skip variants that ended up blank so the cleaner never sees "".
    augmented_sentences = [
        get_only_chars(candidate)
        for candidate in augmented_sentences
        if candidate.strip()
    ]
    random.shuffle(augmented_sentences)

    # Trim so that we have the desired number of augmented sentences.
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:int(num_aug)]
    elif augmented_sentences:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    # The original sentence always comes last.
    augmented_sentences.append(sentence)

    return augmented_sentences

  words = [word for word in words if word is not '']


In [50]:
import pandas as pd
data = pd.read_csv('personality_str.csv')

In [51]:
# Keep only the rows whose Text cell is not a float or an int
# (a missing cell read by pandas is a float NaN, so those rows are dropped too).
is_text = data['Text'].apply(lambda cell: not isinstance(cell, (float, int)))
data = data.loc[is_text]

In [52]:
# Remove the six leftover "Unnamed: 0*" columns from the loaded frame.
data.drop(
    columns=[
        'Unnamed: 0',
        'Unnamed: 0.1',
        'Unnamed: 0.2',
        'Unnamed: 0.3',
        'Unnamed: 0.4',
        'Unnamed: 0.5',
    ],
    inplace=True,
)

In [53]:
data.index = range(len(data))

In [54]:
data.index = range(len(data))

In [55]:
data['Text'][0]

'edit not answer useless decide bring view sasuke uchiha seni middle axis tefi understandable avoid talk topic not change mind start dominant type not moralist morality link not weird dominant type betray dominant type not submit objective formulla type ifp sentiment inaccessible deeply subjective efficient not argument tefi seni efficiency objective factor subjective perception sasuke efficiency efficient proof not stupid people type identify person psychological type dominant function person jung dominant type book psychological type talk person focus purpose single person confuses vision introvert intuitive person reflect concrete reality intuitive meaning type dominant type sasuke seek meaning purpose immature not offend character not purpose immature tertiary function tertiary type not inside deeply dominant type properly immature sasuke immature purpose life sasuke kill itachi not relate dominant type event meaning sasuke seek revenge not highscale meaning life itachi victim deci

In [56]:
data

Unnamed: 0,Name,Type,Text
0,Sasuke Uchiha,ISFP,edit not answer useless decide bring view sasu...
1,Sasuke Uchiha,INTJ,conflict sasuke type frankly distinguish emoti...
2,Sasuke Uchiha,INTJ,not sasuke selforiented main goal kill brother...
3,Sasuke Uchiha,ISFP,sasuke childhood wound relates type type not l...
4,Sasuke Uchiha,INTJ,sasuke vengeful punitive sexual not lustful pr...
...,...,...,...
24216,Aisha (Lord Azoth),ENTJ,hard type inside ur brain tritype legitimately...
24217,Aisha / Layla,ESTP,dare vote flirty person love dancin sport acti...
24218,Aisha / Layla,ISTP,unpopular opinion honestly not classical total...
24219,Aisha / Layla,ESTP,obvious season true recall childhood lot traum...


In [57]:
data['Text']

0        edit not answer useless decide bring view sasu...
1        conflict sasuke type frankly distinguish emoti...
2        not sasuke selforiented main goal kill brother...
3        sasuke childhood wound relates type type not l...
4        sasuke vengeful punitive sexual not lustful pr...
                               ...                        
24216    hard type inside ur brain tritype legitimately...
24217    dare vote flirty person love dancin sport acti...
24218    unpopular opinion honestly not classical total...
24219    obvious season true recall childhood lot traum...
24220    dom obsess image strive emperor sacrifice love...
Name: Text, Length: 24221, dtype: object

In [58]:
# Number of whitespace-separated tokens in every text, in row order.
q = [len(text.split()) for text in data['Text']]

In [59]:
# Keep only the rows whose text has more than 20 whitespace-separated tokens.
# The previous loop issued one in-place drop per short row and relied on `q`
# from the previous cell lining up with index labels 0..n-1. A single boolean
# mask computed from the frame itself gives the same rows, does not depend on
# hidden state, and can be re-run safely.
word_counts = data['Text'].str.split().str.len()
data = data.loc[word_counts > 20]

In [62]:
data

Unnamed: 0,Name,Type,Text
0,Sasuke Uchiha,ISFP,edit not answer useless decide bring view sasu...
1,Sasuke Uchiha,INTJ,conflict sasuke type frankly distinguish emoti...
2,Sasuke Uchiha,INTJ,not sasuke selforiented main goal kill brother...
3,Sasuke Uchiha,ISFP,sasuke childhood wound relates type type not l...
4,Sasuke Uchiha,INTJ,sasuke vengeful punitive sexual not lustful pr...
...,...,...,...
22999,Aisha (Lord Azoth),ENTJ,hard type inside ur brain tritype legitimately...
23000,Aisha / Layla,ESTP,dare vote flirty person love dancin sport acti...
23001,Aisha / Layla,ISTP,unpopular opinion honestly not classical total...
23002,Aisha / Layla,ESTP,obvious season true recall childhood lot traum...


In [61]:
data.index = range(len(data))

In [63]:
data.to_csv('lower_data.csv',encoding='utf-8')

In [64]:
# Character length of every text, in row order.
q = [len(text) for text in data['Text']]

# Print "<position> <length>" for every text shorter than 20 characters.
for position, n_chars in enumerate(q):
    if n_chars < 20:
        print(position, n_chars)

# Renumber the rows 0..n-1.
data.index = range(len(data))

In [65]:
import pandas as pd
data = pd.read_csv('lower_data.csv')

In [66]:
data.drop('Unnamed: 0',axis=1,inplace=True)

In [22]:
# Two-letter label built by concatenating the 'IE' and 'SN' columns row by row.
# NOTE(review): 'IE' and 'SN' are only created by the In [71] / In [72] cells
# further down this notebook, so this cell fails on a fresh top-to-bottom run.
data['IE/SN'] = data['IE']+data['SN']

In [23]:
text1 = data['Text']

In [24]:
data

Unnamed: 0,Name,Type,Text,IE,SN,FT,PJ,IE/SN
0,Sasuke Uchiha,ISFP,edit not answer useless decide bring view sasu...,I,S,F,P,IS
1,Sasuke Uchiha,INTJ,conflict sasuke type frankly distinguish emoti...,I,N,T,J,IN
2,Sasuke Uchiha,INTJ,not sasuke selforiented main goal kill brother...,I,N,T,J,IN
3,Sasuke Uchiha,ISFP,sasuke childhood wound relates type type not l...,I,S,F,P,IS
4,Sasuke Uchiha,INTJ,sasuke vengeful punitive sexual not lustful pr...,I,N,T,J,IN
...,...,...,...,...,...,...,...,...
22792,Aisha (Lord Azoth),ENTJ,hard type inside ur brainfor tritype legitimat...,E,N,T,J,EN
22793,Aisha / Layla,ESTP,dare voteshe flirty person love dancin sport a...,E,S,T,P,ES
22794,Aisha / Layla,ISTP,unpopular opinion honestly not classical total...,I,S,T,P,IS
22795,Aisha / Layla,ESTP,obvious season true recall childhood lot traum...,E,S,T,P,ES


In [67]:
import re
def cleantxt(raw):
    """Replace every run of disallowed characters in ``raw`` with one space.

    Allowed characters are ASCII digits and letters, code points U+4E00 to
    U+9FA5, and the punctuation marks listed in the pattern (``.``, ``,`` and
    several full-width marks). Everything else, including whitespace runs,
    collapses to a single space.

    Args:
        raw: the text to clean (``str``).

    Returns:
        The cleaned string.
    """
    return re.sub(u'[^0-9a-zA-Z\u4e00-\u9fa5.，,。？“”]+', ' ', raw, flags=re.UNICODE)

In [68]:
# Run cleantxt over every entry of the Text column.
text2 = data['Text'].apply(cleantxt)

In [69]:
data['Text'] = text2

In [70]:
data

Unnamed: 0,Name,Type,Text
0,Sasuke Uchiha,ISFP,edit not answer useless decide bring view sasu...
1,Sasuke Uchiha,INTJ,conflict sasuke type frankly distinguish emoti...
2,Sasuke Uchiha,INTJ,not sasuke selforiented main goal kill brother...
3,Sasuke Uchiha,ISFP,sasuke childhood wound relates type type not l...
4,Sasuke Uchiha,INTJ,sasuke vengeful punitive sexual not lustful pr...
...,...,...,...
22999,Aisha (Lord Azoth),ENTJ,hard type inside ur brain tritype legitimately...
23000,Aisha / Layla,ESTP,dare vote flirty person love dancin sport acti...
23001,Aisha / Layla,ISTP,unpopular opinion honestly not classical total...
23002,Aisha / Layla,ESTP,obvious season true recall childhood lot traum...


In [71]:
# 'I' for the eight type codes that start with I; every other value of
# Type, including anything unexpected, is labelled 'E'.
IE = [
    'I' if type_code in ('INTJ', 'INTP', 'INFJ', 'INFP', 'ISTJ', 'ISTP', 'ISFJ', 'ISFP') else 'E'
    for type_code in data['Type']
]

data['IE'] = IE

In [72]:
# 'N' for the eight type codes whose second letter is N; every other value of
# Type, including anything unexpected, is labelled 'S'.
SN = [
    'N' if type_code in ('INTJ', 'INTP', 'INFJ', 'INFP', 'ENTJ', 'ENTP', 'ENFJ', 'ENFP') else 'S'
    for type_code in data['Type']
]
data['SN'] = SN

In [73]:
# 'T' for the eight type codes whose third letter is T; every other value of
# Type, including anything unexpected, is labelled 'F'.
FT = [
    'T' if type_code in ('INTJ', 'INTP', 'ISTJ', 'ISTP', 'ENTJ', 'ENTP', 'ESTJ', 'ESTP') else 'F'
    for type_code in data['Type']
]
data['FT'] = FT

In [74]:
# 'P' for the eight type codes that end in P; every other value of Type,
# including anything unexpected, is labelled 'J'.
PJ = [
    'P' if type_code in ('INTP', 'INFP', 'ISTP', 'ISFP', 'ENTP', 'ENFP', 'ESTP', 'ESFP') else 'J'
    for type_code in data['Type']
]
data['PJ'] = PJ

In [75]:
data

Unnamed: 0,Name,Type,Text,IE,SN,FT,PJ
0,Sasuke Uchiha,ISFP,edit not answer useless decide bring view sasu...,I,S,F,P
1,Sasuke Uchiha,INTJ,conflict sasuke type frankly distinguish emoti...,I,N,T,J
2,Sasuke Uchiha,INTJ,not sasuke selforiented main goal kill brother...,I,N,T,J
3,Sasuke Uchiha,ISFP,sasuke childhood wound relates type type not l...,I,S,F,P
4,Sasuke Uchiha,INTJ,sasuke vengeful punitive sexual not lustful pr...,I,N,T,J
...,...,...,...,...,...,...,...
22999,Aisha (Lord Azoth),ENTJ,hard type inside ur brain tritype legitimately...,E,N,T,J
23000,Aisha / Layla,ESTP,dare vote flirty person love dancin sport acti...,E,S,T,P
23001,Aisha / Layla,ISTP,unpopular opinion honestly not classical total...,I,S,T,P
23002,Aisha / Layla,ESTP,obvious season true recall childhood lot traum...,E,S,T,P


In [76]:
data.to_csv('lower_data.csv')

In [6]:
# Small demo of the enchant SpellChecker: print each misspelled word of a
# sample sentence followed by the checker's first suggestion for it.
from enchant.checker import SpellChecker

chkr = SpellChecker("en_US")
chkr.set_text("This is sme sample txt with erors.")
for err in chkr:
    print("ERROR", err.word)
    sug = err.suggest()[0]
    print(sug)
        

ERROR sme
time
ERROR txt
text
ERROR erors
errs


In [1]:
import enchant
import wx
from enchant.checker import SpellChecker
from enchant.checker.wxSpellCheckerDialog import wxSpellCheckerDialog
from enchant.checker.CmdLineChecker import CmdLineChecker

# Spell-correct every text: each word the checker flags is replaced by its
# first suggestion; words with no suggestion are left as they are.
# NOTE(review): the first suggestion is applied blindly. The saved output of
# this notebook shows names being rewritten (e.g. "sapsucker" in the rows for
# Sasuke Uchiha), so review whether proper nouns should be protected.
chkr = enchant.checker.SpellChecker('en_US')
q = []

for text in data['Text']:
    chkr.set_text(text)
    for err in chkr:
        try:
            best_guess = err.suggest()[0]
        except IndexError:
            # No suggestion available for this word.
            continue
        err.replace(best_guess)
    # get_text() returns the text with the replacements applied.
    q.append(chkr.get_text())



NameError: name 'data' is not defined

In [104]:
len(q)

22797

In [105]:
len(data['Text'])

22797

In [106]:
data['Text'] = q

In [109]:
data.to_csv('spell_check_data.csv')

In [110]:
data

Unnamed: 0,Name,Type,Text,IE,SN,FT,PJ,IE/SN
0,Sasuke Uchiha,ISFP,edit not answer useless decide bring view saps...,I,S,F,P,IS
1,Sasuke Uchiha,INTJ,conflict sapsucker type frankly distinguish em...,I,N,T,J,IN
2,Sasuke Uchiha,INTJ,not sapsucker self oriented main goal kill bro...,I,N,T,J,IN
3,Sasuke Uchiha,ISFP,sapsucker childhood wound relates type type no...,I,S,F,P,IS
4,Sasuke Uchiha,INTJ,sapsucker vengeful punitive sexual not lustful...,I,N,T,J,IN
...,...,...,...,...,...,...,...,...
22792,Aisha (Lord Azoth),ENTJ,hard type inside Ur brain for varitype legitim...,E,N,T,J,EN
22793,Aisha / Layla,ESTP,dare vote she flirty person love dancing sport...,E,S,T,P,ES
22794,Aisha / Layla,ISTP,unpopular opinion honestly not classical total...,I,S,T,P,IS
22795,Aisha / Layla,ESTP,obvious season true recall childhood lot traum...,E,S,T,P,ES


In [111]:
data = pd.read_csv('spell_check_data.csv')

In [113]:
data['IE']

0        I
1        I
2        I
3        I
4        I
        ..
22792    E
22793    E
22794    I
22795    E
22796    E
Name: IE, Length: 22797, dtype: object