### Data Preparation Outcomes:
Varaibles created and their description

* word_token_nltk: Raw data word tokens (Punctuations are separate tokens)
* sent_token: Rarw data sentence tokens
* word_token_manual: Raw data word tokens( Punctuations are not tokens)
* clean_text: Text data with no stop words and punctuations
* lemm_text: Clean text with lemmatization
* freq_dist: Dictionary of lemm words with their frequency counts
* most_common_words: List of tuple of 3 most frequently occuring lemmatized text words with their count
* distinct_words_cnt: count of distinct lemmatized text words


#### Libraries

In [None]:
#pip install nltk

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.expand_frame_repr', False)
# import nltk
# nltk.download('all')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer


#### Data Ingestion

In [3]:
data = pd.read_csv('train_data/train.csv')
data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   text_id      3911 non-null   object 
 1   full_text    3911 non-null   object 
 2   cohesion     3911 non-null   float64
 3   syntax       3911 non-null   float64
 4   vocabulary   3911 non-null   float64
 5   phraseology  3911 non-null   float64
 6   grammar      3911 non-null   float64
 7   conventions  3911 non-null   float64
dtypes: float64(6), object(2)
memory usage: 244.6+ KB


#### Tokenization

In [5]:
data['word_token_nltk'] = data['full_text'].apply(lambda x: word_tokenize(x)) #Tokenize each words and punctuations
data['sent_token'] = data['full_text'].apply(lambda x: sent_tokenize(x))# Tokenize sentences

In [6]:
# Splitting using space should preserve context in terms of use of punctuation
data['word_token_manual'] =data['full_text'].apply(lambda x: x.split(' '))
print(data['word_token_manual'][0])

['I', 'think', 'that', 'students', 'would', 'benefit', 'from', 'learning', 'at', 'home,because', 'they', 'wont', 'have', 'to', 'change', 'and', 'get', 'up', 'early', 'in', 'the', 'morning', 'to', 'shower', 'and', 'do', 'there', 'hair.', 'taking', 'only', 'classes', 'helps', 'them', 'because', 'at', 'there', 'house', "they'll", 'be', 'pay', 'more', 'attention.', 'they', 'will', 'be', 'comfortable', 'at', 'home.\n\nThe', 'hardest', 'part', 'of', 'school', 'is', 'getting', 'ready.', 'you', 'wake', 'up', 'go', 'brush', 'your', 'teeth', 'and', 'go', 'to', 'your', 'closet', 'and', 'look', 'at', 'your', 'cloths.', 'after', 'you', 'think', 'you', 'picked', 'a', 'outfit', 'u', 'go', 'look', 'in', 'the', 'mirror', 'and', 'youll', 'either', 'not', 'like', 'it', 'or', 'you', 'look', 'and', 'see', 'a', 'stain.', 'Then', "you'll", 'have', 'to', 'change.', 'with', 'the', 'online', 'classes', 'you', 'can', 'wear', 'anything', 'and', 'stay', 'home', 'and', 'you', 'wont', 'need', 'to', 'stress', 'about'

In [7]:
print(data['word_token_nltk'][0])#Sample
print(data['sent_token'][0])#Sample

['I', 'think', 'that', 'students', 'would', 'benefit', 'from', 'learning', 'at', 'home', ',', 'because', 'they', 'wont', 'have', 'to', 'change', 'and', 'get', 'up', 'early', 'in', 'the', 'morning', 'to', 'shower', 'and', 'do', 'there', 'hair', '.', 'taking', 'only', 'classes', 'helps', 'them', 'because', 'at', 'there', 'house', 'they', "'ll", 'be', 'pay', 'more', 'attention', '.', 'they', 'will', 'be', 'comfortable', 'at', 'home', '.', 'The', 'hardest', 'part', 'of', 'school', 'is', 'getting', 'ready', '.', 'you', 'wake', 'up', 'go', 'brush', 'your', 'teeth', 'and', 'go', 'to', 'your', 'closet', 'and', 'look', 'at', 'your', 'cloths', '.', 'after', 'you', 'think', 'you', 'picked', 'a', 'outfit', 'u', 'go', 'look', 'in', 'the', 'mirror', 'and', 'youll', 'either', 'not', 'like', 'it', 'or', 'you', 'look', 'and', 'see', 'a', 'stain', '.', 'Then', 'you', "'ll", 'have', 'to', 'change', '.', 'with', 'the', 'online', 'classes', 'you', 'can', 'wear', 'anything', 'and', 'stay', 'home', 'and', 'y

#### tokenization and features on clean data(Without stop words and punctuations)

In [8]:
#Creating a stopword free full text column for analysis; Punctuations need to be removed separately
stopword_list = stopwords.words("english")
data['clean_text'] = data['full_text'].apply(lambda x: [i.lower() for i in word_tokenize(x) if i.lower() not in stopword_list])
data['clean_text'] = data['clean_text'].apply(lambda x: [w for w in x if w.isalpha()])
# Lemmatizing the tokens to convert them to contextual root words
wordnet_lemmatizer = WordNetLemmatizer()
data['lemm_text'] = data['clean_text'].apply(lambda x: [wordnet_lemmatizer.lemmatize(w) for w in x])
print(data['lemm_text'][0])#Sample

['think', 'student', 'would', 'benefit', 'learning', 'home', 'wont', 'change', 'get', 'early', 'morning', 'shower', 'hair', 'taking', 'class', 'help', 'house', 'pay', 'attention', 'comfortable', 'home', 'hardest', 'part', 'school', 'getting', 'ready', 'wake', 'go', 'brush', 'teeth', 'go', 'closet', 'look', 'cloth', 'think', 'picked', 'outfit', 'u', 'go', 'look', 'mirror', 'youll', 'either', 'like', 'look', 'see', 'stain', 'change', 'online', 'class', 'wear', 'anything', 'stay', 'home', 'wont', 'need', 'stress', 'wear', 'student', 'usually', 'take', 'shower', 'school', 'either', 'take', 'sleep', 'wake', 'student', 'smell', 'good', 'cause', 'miss', 'bus', 'effect', 'lesson', 'time', 'cause', 'come', 'late', 'school', 'u', 'online', 'class', 'u', 'wont', 'need', 'miss', 'lesson', 'cause', 'get', 'everything', 'set', 'go', 'take', 'shower', 'u', 'get', 'ready', 'go', 'home', 'comfortable', 'pay', 'attention', 'give', 'advantage', 'smarter', 'even', 'pas', 'classmate', 'class', 'work', 'pub

In [9]:
data['freq_dist'] = data['lemm_text'].apply(lambda x: FreqDist(x))
data['freq_dist'][0]

FreqDist({'student': 5, 'class': 5, 'go': 5, 'home': 4, 'school': 4, 'u': 4, 'cause': 4, 'wont': 3, 'get': 3, 'shower': 3, ...})

In [10]:
print(data['freq_dist'][0])

<FreqDist with 81 samples and 129 outcomes>


In [11]:
# List of 3 most commonly used words with their count
data['most_common_words'] = data['freq_dist'].apply(lambda x: x.most_common(3))
data['most_common_words'][0]

[('student', 5), ('class', 5), ('go', 5)]

In [12]:
# Distinct root words in the clean data
data['distinct_words_cnt'] = data['freq_dist'].apply(lambda x: len(x.keys()))
data['distinct_words_cnt'][0]

81

In [13]:
# Writing Data to csv
data.to_csv('train_tokenized.csv')