# Loading and checking data

In [1]:
import pandas as pd

In [2]:
#reading in data 
train = pd.read_csv('mine_raw.csv')

In [3]:
#data checking
train['word_count'] = train['OriginalShorttext'].apply(lambda x: len(str(x).split(" ")))
train[['OriginalShorttext','word_count']].head()

Unnamed: 0,OriginalShorttext,word_count
0,BUCKET WON'T OPEN,3
1,L/H BUCKET CYL LEAKING.,4
2,SWAP BUCKET,2
3,FIT BUCKET TOOTH,3
4,REFIT BUCKET TOOTH,3


In [4]:
train['char_count'] = train['OriginalShorttext'].str.len() ## this also includes spaces
train[['OriginalShorttext','char_count']].head()

Unnamed: 0,OriginalShorttext,char_count
0,BUCKET WON'T OPEN,17
1,L/H BUCKET CYL LEAKING.,23
2,SWAP BUCKET,11
3,FIT BUCKET TOOTH,16
4,REFIT BUCKET TOOTH,18


In [5]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [6]:
train['avg_word'] = train['OriginalShorttext'].apply(lambda x: avg_word(x))
train[['OriginalShorttext','avg_word']].head()

Unnamed: 0,OriginalShorttext,avg_word
0,BUCKET WON'T OPEN,5.0
1,L/H BUCKET CYL LEAKING.,5.0
2,SWAP BUCKET,5.0
3,FIT BUCKET TOOTH,4.666667
4,REFIT BUCKET TOOTH,5.333333


## Pre-proccessing data

In [7]:
#removing capitalization
train['OriginalShorttext'] = train['OriginalShorttext'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['OriginalShorttext'].head

<bound method NDFrame.head of 0                              bucket won't open
1                        l/h bucket cyl leaking.
2                                    swap bucket
3                               fit bucket tooth
4                             refit bucket tooth
5                   repair loose bucket cylinder
6                         replace 2 bucket teeth
7                       replace missing lip tips
8                                    replace lip
9                                 rebuild lip #3
10                         replace bucket teeth.
11                       repair rh120 lip at p&h
12                           replace missing tip
13                 replace missing tip& adaptor.
14             replace centre and lh lip shrouds
15                        oil leak on bucket....
16                          replace adaptor lock
17                              fit a lip shroud
18                         refit left lip shroud
19                       refit adapter 

In [8]:
#remove punctuation
train['OriginalShorttext'] = train['OriginalShorttext'].str.replace('[^\w\s]','')
train['OriginalShorttext'].head()

0         bucket wont open
1    lh bucket cyl leaking
2              swap bucket
3         fit bucket tooth
4       refit bucket tooth
Name: OriginalShorttext, dtype: object

In [9]:
#remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')

train['stopwords'] = train['OriginalShorttext'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['OriginalShorttext'].head()

0         bucket wont open
1    lh bucket cyl leaking
2              swap bucket
3         fit bucket tooth
4       refit bucket tooth
Name: OriginalShorttext, dtype: object

In [10]:
#remove all of the words that only appear once
freq = pd.Series(' '.join(train['OriginalShorttext']).split()).value_counts()[-974:]
freq = list(freq.index)
train['OriginalShorttext'] = train['OriginalShorttext'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['OriginalShorttext'].head()

0         bucket wont open
1    lh bucket cyl leaking
2                   bucket
3         fit bucket tooth
4       refit bucket tooth
Name: OriginalShorttext, dtype: object

In [11]:
#stemming the words
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['OriginalShorttext'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0      bucket wont open
1    lh bucket cyl leak
2                bucket
3      fit bucket tooth
4    refit bucket tooth
Name: OriginalShorttext, dtype: object

## Word Embeddings 
Preparing the data into a matrix so I can train a neural net on it

In [12]:
from textblob import TextBlob

In [37]:
trigrams = [(train['OriginalShorttext'][i]).split()
    for i in range(len(train['OriginalShorttext']))]
print(trigrams)



In [42]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['OriginalShorttext'])
train_bow

<5485x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 24089 stored elements in Compressed Sparse Row format>