In [68]:
# Load libraries
import pandas as pd

In [69]:
# Read the training set from train.csv and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data.head(n=5)

Unnamed: 0,ID,A1,A2,A3,A4,A5,Score
0,14103__worth_9__free_5,1,1,1,0,0,0.369248
1,4603__fretted_10__mother_16,3,3,2,2,0,1.436513
2,3706__attitudes_14__lax_12,3,3,2,1,0,0.994271
3,3098__settlement_14__achieve_12,2,1,1,0,0,0.378663
4,716__mask_30__stripped_28,2,2,2,1,1,0.722123


In [70]:
# Preview the raw composite ID strings before they are split into parts.
train_data.ID.head(n=5)

0             14103__worth_9__free_5
1        4603__fretted_10__mother_16
2         3706__attitudes_14__lax_12
3    3098__settlement_14__achieve_12
4          716__mask_30__stripped_28
Name: ID, dtype: object

In [71]:
# Extract the leading numeric sentence id from the ID column (regular expression).
# - Raw string: '\d' inside a plain string literal is an invalid escape sequence
#   and triggers a warning on recent Python versions.
# - '\d+' instead of '\d*': an ID that does not start with a digit yields NaN
#   rather than an empty string.
# - expand=False makes str.extract return a Series instead of a one-column DataFrame.
train_data['sentence_id'] = (
    train_data['ID']
    .str.extract(r'^(\d+)', expand=False)
    .astype(int)
)
train_data['sentence_id'].head()

0    14103
1     4603
2     3706
3     3098
4      716
Name: sentence_id, dtype: int32

In [72]:
# Pull the first run of ASCII letters out of each ID string and store it as 'word'
# (e.g. '14103__worth_9__free_5' -> 'worth').
first_alpha_run = '([A-Za-z]+)'
train_data['word'] = train_data['ID'].str.extract(first_alpha_run)
train_data['word'].head(n=5)

0         worth
1       fretted
2     attitudes
3    settlement
4          mask
Name: word, dtype: object

In [73]:
# Extract the metaphor token from the ID column (regular expression).
# The ID ends with '<letters>_<digits>'; the capture group keeps only the letters
# (e.g. '14103__worth_9__free_5' -> 'free').
# A single raw-string pattern replaces the former two-pass extraction, and avoids
# the invalid '\d' escape sequence of a plain string literal. expand=False makes
# str.extract return a Series instead of a one-column DataFrame.
train_data['metaphor'] = train_data['ID'].str.extract(r'([A-Za-z]+)_\d+$', expand=False)
train_data['metaphor'].head()

0        free
1      mother
2         lax
3     achieve
4    stripped
Name: metaphor, dtype: object

In [74]:
# Preview the Score column.
train_data.Score.head(n=5)

0    0.369248
1    1.436513
2    0.994271
3    0.378663
4    0.722123
Name: Score, dtype: float64

In [75]:
# Keep only the parsed ID parts and the score; the raw ID and A1-A5 columns are left out.
train = train_data.loc[:, ['sentence_id','word', 'metaphor', 'Score']]
train.head(n=5)

Unnamed: 0,sentence_id,word,metaphor,Score
0,14103,worth,free,0.369248
1,4603,fretted,mother,1.436513
2,3706,attitudes,lax,0.994271
3,3098,settlement,achieve,0.378663
4,716,mask,stripped,0.722123


In [76]:
# Read the corpus sentences from vuamc_corpus.csv and preview the first five rows.
corpus_data = pd.read_csv(filepath_or_buffer='vuamc_corpus.csv')
corpus_data.head(n=5)

Unnamed: 0,sentence_id,fragment_id,within_fragment_sentence_id,sentence_txt
0,,,,
1,0.0,a1e-fragment01,1.0,Latest corporate unbundler M_reveals laid-back...
2,,,,
3,1.0,a1e-fragment01,2.0,By FRANK KANE
4,,,,


In [77]:
# Keep only the sentence id and text, then drop every row in which either is missing.
corpus_data = corpus_data.loc[:, ['sentence_id','sentence_txt']].dropna(how='any')
corpus_data.head(n=5)

Unnamed: 0,sentence_id,sentence_txt
1,0.0,Latest corporate unbundler M_reveals laid-back...
3,1.0,By FRANK KANE
5,2.0,"IT SEEMS that Roland Franklin , the latest unb..."
7,3.0,He has not properly investigated the M_target ...
9,4.0,The 63-year-old M_head of Pembridge Investment...


In [78]:
# Cast sentence_id to integer (it was read as float, e.g. 0.0, 1.0).
corpus_data = corpus_data.astype({'sentence_id': int})
corpus_data.head(n=5)

Unnamed: 0,sentence_id,sentence_txt
1,0,Latest corporate unbundler M_reveals laid-back...
3,1,By FRANK KANE
5,2,"IT SEEMS that Roland Franklin , the latest unb..."
7,3,He has not properly investigated the M_target ...
9,4,The 63-year-old M_head of Pembridge Investment...


In [79]:
# Remove every literal 'M_' occurrence (e.g. 'M_reveals' -> 'reveals') and turn
# hyphens into spaces (e.g. 'laid-back' -> 'laid back').
# regex=False pins literal matching, so the result does not depend on the pandas
# version's default for `regex`; a single-character pattern such as '-' triggers
# a FutureWarning on some versions when the argument is left implicit.
corpus_data['sentence_txt'] = (
    corpus_data['sentence_txt']
    .str.replace('M_', '', regex=False)
    .str.replace('-', ' ', regex=False)
)
corpus_data['sentence_txt'].head()

1    Latest corporate unbundler reveals laid back a...
3                                        By FRANK KANE
5    IT SEEMS that Roland Franklin , the latest unb...
7    He has not properly investigated the target 's...
9    The 63 year old head of Pembridge Investments ...
Name: sentence_txt, dtype: object

In [80]:
# Force the text to ASCII: each character that cannot be encoded as ASCII becomes
# '?' ("replace" error handler), then the resulting bytes are decoded back to str.
corpus_data['sentence_txt'] = (
    corpus_data['sentence_txt']
    .str.encode("ascii", "replace")
    .str.decode("utf-8", "ignore")
)
corpus_data['sentence_txt'].head(n=5)

1    Latest corporate unbundler reveals laid back a...
3                                        By FRANK KANE
5    IT SEEMS that Roland Franklin , the latest unb...
7    He has not properly investigated the target 's...
9    The 63 year old head of Pembridge Investments ...
Name: sentence_txt, dtype: object

In [81]:
# Replace every '?' with a space; this covers the '?' placeholders produced by
# encoding the text to ASCII with the "replace" error handler.
# regex=False pins literal matching: a lone '?' is not a valid regular expression,
# and pandas versions differ on whether a single-character pattern is treated as
# a regex by default.
# NOTE(review): this also blanks out genuine question marks in the sentences,
# not only the encoding placeholders - confirm that this is intended.
corpus_data['sentence_txt'] = corpus_data['sentence_txt'].str.replace("?", " ", regex=False)
corpus_data.head()

Unnamed: 0,sentence_id,sentence_txt
1,0,Latest corporate unbundler reveals laid back a...
3,1,By FRANK KANE
5,2,"IT SEEMS that Roland Franklin , the latest unb..."
7,3,He has not properly investigated the target 's...
9,4,The 63 year old head of Pembridge Investments ...


In [82]:
# Attach the sentence text to each training row by looking up sentence_id in the
# corpus (left join: every training row is kept).
cols = ['sentence_id']
sentence_lookup = corpus_data.set_index(cols)
train_visulization_data = train.join(sentence_lookup, on=cols, how='left')
train_visulization_data

Unnamed: 0,sentence_id,word,metaphor,Score,sentence_txt
0,14103,worth,free,0.369248,So I get a free twenty five pound worth of Mar...
1,4603,fretted,mother,1.436513,In the fitting rooms at Taylors she fussed and...
2,3706,attitudes,lax,0.994271,Studies in this area have indicated that too s...
3,3098,settlement,achieve,0.378663,It plans to issue a protective writ but is hop...
4,716,mask,stripped,0.722123,A series of spoof or obsessively realistic 196...
5,2692,time,round,0.676453,The second time round they dealt with R. and B...
6,5410,stretched,ahead,0.702680,"To her , the long summer days had stretched ah..."
7,5595,shape,disguise,0.586667,The seemingly random and jagged edged pattern ...
8,8227,dramatic,warbling,1.999753,"In the high ceilinged kitchen , surrounded by ..."
9,8147,fat,stale,0.655528,"Then , the winter salon did not smell of lilie..."


In [83]:
# Persist the joined frame (row index included) for the visualization step.
train_visulization_data.to_csv(path_or_buf="train_visualization_data.csv")