# Notebook imports

In [65]:
from os import walk
from os.path import join

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from bs4 import BeautifulSoup
from wordcloud import WordCloud
from PIL import Image

from sklearn.model_selection import train_test_split

%matplotlib inline

# Constants

In [2]:
EXAMPLE_FILE = 'SpamData/01_Processing/practice_email.txt'
SPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_1/'
SPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_2/'
NON_SPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1/'
NON_SPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2/'
SPAM = 1
NON_SPAM = 0

DATA_PATH = 'email-data.json'
TRAINING_DATA_FILE = 'data-grouped/train-data.txt'
TEST_DATA_FILE = 'data-grouped/test-data.txt'

VOCAB_SIZE = 2500

WHALE_FILE = 'SpamData/01_Processing/wordcloud_resources/whale-icon.png'
THUMBS_UP_FILE = 'SpamData/01_Processing/wordcloud_resources/thumbs-up.png'
THUMBS_DOWN_FILE = 'SpamData/01_Processing/wordcloud_resources/thumbs-down.png'
FONT_PATH = 'SpamData/01_Processing/wordcloud_resources/OpenSansCondensed-Bold.ttf'

# Reading Files

In [None]:
file = open(EXAMPLE_FILE, encoding='latin-1')
is_body = False
lines = []
for line in file:
    if is_body:
        lines.append(line)
    elif line == '\n':
        is_body = True
file.close()

email_body = '\n'.join(lines)

# Generator Functions

In [None]:
def generate_square(n):
    for number in range(n):
        yield number ** 2

generate_square(10)

In [None]:
for i in generate_square(5):
    print(i, end=' ')

# สาเหตุที่ต้องใช้ก็คือมันมีการจำ state และเมื่อเราเล่นกับ dataset ใหญ่ๆ
# ลองนึกภาพว่าคำนวณก้อนเท่าบ้านออกมาอะ ขนาดข้อมูลนะ ใช้เวลานานมากด้วยกว่าจะรัน state ถัดไปได้

# Email body extraction

In [None]:
def email_body_generator(path):
    for root, dir_names, file_names in walk(path):
        for file_name in file_names:
            file = open(join(root, file_name), encoding='latin-1')
            is_body = False
            lines = []
            for line in file:
                if is_body:
                    lines.append(line)
                elif line == '\n':
                    is_body = True
            file.close()
            email_body = '\n'.join(lines)
            yield file_name, email_body 

In [None]:
def get_df_from_directory(path, classification):
    rows = []
    row_names = []
    for file_name, email_body in email_body_generator(path):
        rows.append({'MESSAGE': email_body, 'CATEGORY': classification})
        row_names.append(file_name)
    return pd.DataFrame(rows, index=row_names)

In [None]:
# if you dont have data-email.csv file
df = get_df_from_directory(SPAM_1_PATH, SPAM)
df = df.append(get_df_from_directory(SPAM_2_PATH, SPAM))
df = df.append(get_df_from_directory(NON_SPAM_1_PATH, NON_SPAM))
df = df.append(get_df_from_directory(NON_SPAM_2_PATH, NON_SPAM))
df

In [4]:
# else:  $read it
# can skip to explore and visualisation
df = pd.read_csv(DATA_PATH.replace('json', 'csv'))
df = df.set_index('id')
df

Unnamed: 0_level_0,MESSAGE,CATEGORY,file_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1,00001.7848dde101aa985090474a91ec93fcf0
1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1,00002.d94f1b97e48ed3b553b3508d116e6a09
2,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,##############################################...,1,00004.eac8de8d759b7e74154f142194282724
4,I thought you might like these:\n\n1) Slim Dow...,1,00005.57696a39d7d84318ce497886896bf90d
...,...,...,...
5791,http://news.bbc.co.uk/1/hi/england/2515127.stm...,0,01396.61983fbe6ec43f55fd44e30fce24ffa6
5792,"> >-- be careful when using this one.) Also, t...",0,01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5793,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",0,01398.169b51731fe569f42169ae8f948ec676
5794,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",0,01399.ca6b00b7b341bbde9a9ea3dd6a7bf896


# Data cleaning: Checking for Missing Values

In [None]:
# check for missing valeus
print(df['MESSAGE'].isnull().values.any(), '\n')
print(df['MESSAGE'].isnull().value_counts())

In [None]:
# check for empty email
print((df['MESSAGE'].str.len() == 0).values.any(), '\n')
print((df['MESSAGE'].str.len() == 0).value_counts())

In [None]:
# locate emptry email
print(df['MESSAGE'].str.len()==0)
df[df['MESSAGE'].str.len()==0].index  # get the index, name which match given condition

In [None]:
#df.index.get_loc('cmds')  # return narray type: boolean
#df[df.index.get_loc('cmds')]
#df.index

a = df[df['MESSAGE'].str.len()==0].index
df.loc[['cmds']]

## Remove System file Entries from DataFrame

In [None]:
df.drop(['cmds'], inplace=True)  # inplace make chahge with df, drop with index of list of index's name

In [None]:
df.shape

In [None]:
doc_id = range(0, len(df.index))
df['id'] = doc_id
df['file_name'] = df.index
df = df.set_index('id')
df

In [None]:
# save to file using pandas
df.to_json(DATA_PATH)
df.to_csv(DATA_PATH.replace('json', 'csv'))

# Explore & Data visualisation

In [None]:
df['CATEGORY'].value_counts()
non_spam_amount = df['CATEGORY'].value_counts()[0]
spam_amount = df['CATEGORY'].value_counts()[1]

In [None]:
sizes = [spam_amount, non_spam_amount]
categ_names = ['Spam', 'Non-Spam']
plt.figure(figsize=[8, 8])
plt.pie(sizes, labels=categ_names, startangle=90, textprops={'fontsize': 16}, autopct='%1.0f%%',
        explode=[0, 0.1])
plt.show()

In [None]:
plt.figure(figsize=[8, 8])
plt.pie(sizes, labels=categ_names, startangle=90, textprops={'fontsize': 16}, autopct='%1.0f%%',
        pctdistance=0.8)
plt.gca().add_artist(plt.Circle((0, 0), radius=0.6, fc='white'))
plt.show()

# Natural Language Processing

## pre-processing 

### Convert to Lower Case

In [None]:
word = 'Prayut is bad soldier. That\'s True'
word.lower()

### Download the NLTK Resources (Tokenizer & Stopwords)

In [None]:
nltk.download('punkt')

### Tokenising

In [None]:
word_tokenize(word.lower())  # like split.(' ')

### Stop words  & Stemming

In [None]:
nltk.download('stopwords')

In [None]:
# example nltk resource for data visualisation
nltk.download('gutenberg')
nltk.download('shakespeare')

In [None]:
stop_words = stopwords.words('english')

In [None]:
word = 'Prayut do and did Bad. That\'s True. To be or not to be. He should not have done. Inovation'
words = word_tokenize(word.lower())
filterd_words = []
stemmer = PorterStemmer()
#stemmer = SnowballStemmer('english')

for word in words:
    if word not in stop_words and word.isalpha():
        stemmed_word = stemmer.stem(word)
        filterd_words.append(stemmed_word)
    
filterd_words

In [None]:
print('>'.isalpha())
print('Ps'.isalpha())

### Removing HTML tag in email body

In [None]:
# at[[row], [column name]]
soup = BeautifulSoup(df.at[2, 'MESSAGE'], 'html.parser')
print(soup.prettify())
print(soup.get_text())

In [5]:
def get_clean_message(message, stemmer=PorterStemmer(), stop_words=set(stopwords.words('english'))):
    message = BeautifulSoup(message, 'html.parser').get_text().lower()  # remove html tag and set to lowwer
    words = word_tokenize(message)  # split the word
    filtered_words = []
    
    for word in words:
        # remove stop word and punctuation
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
    
    return filtered_words  # list of filtered words

In [None]:
get_clean_message(df.at[2, 'MESSAGE'])  # at is used to access the particular cell


# Apply Cleaning and Tokenisation to all messages

#### at[row, column] 
#### at[1, 2] >> at[1, 'the second column'] is the same

In [None]:
# to select more than 1 row, work on datagrame and series
print(type(df.iloc[5:11]))  # iloc >> integer location
print(df['MESSAGE'].iloc[5:11])

In [6]:
%%time

df['MESSAGE'] = df['MESSAGE'].apply(get_clean_message)
df

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


Wall time: 30.3 s


Unnamed: 0_level_0,MESSAGE,CATEGORY,file_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"[save, life, insur, spend, life, quot, save, g...",1,00001.7848dde101aa985090474a91ec93fcf0
1,"[fight, risk, cancer, http, slim, guarante, lo...",1,00002.d94f1b97e48ed3b553b3508d116e6a09
2,"[fight, risk, cancer, http, slim, guarante, lo...",1,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,"[adult, club, offer, free, membership, instant...",1,00004.eac8de8d759b7e74154f142194282724
4,"[thought, might, like, slim, guarante, lose, l...",1,00005.57696a39d7d84318ce497886896bf90d
...,...,...,...
5791,"[http, bizarr, collect, stuf, anim, could, fet...",0,01396.61983fbe6ec43f55fd44e30fce24ffa6
5792,"[care, use, one, also, realli, cute, thing, ja...",0,01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5793,"[sm, skip, montanaro, write, jeremi, put, anot...",0,01398.169b51731fe569f42169ae8f948ec676
5794,"[mark, hammond, like, given, zodb, sound, attr...",0,01399.ca6b00b7b341bbde9a9ea3dd6a7bf896


### Using Logic to Slice DataFrames

In [7]:
#df[given narray with true false condition]
spam_emails_index = df[df['CATEGORY'] == 1].index
non_spam_emails_index = df[df['CATEGORY'] == 0].index

In [None]:
print(df.loc[spam_emails_index])
print(df.loc[non_spam_emails_index])
# locate > gets rows (or columns) with particular labels from the index.
# iloc > integer locate > gets rows (or columns) at particular positions in the index (so it only takes integers).

In [8]:
# จากด้านบนเราคิดว่าสามารถย่อมันเหลือแค่นิดเดียวก็ได้นิทำทำไมตั้งหลายรอบ
#df[given narray with true false condition]
spam_emails_df = df[df['CATEGORY'] == 1]
non_spam_emails_df = df[df['CATEGORY'] == 0]

#### misunderstand 

In [None]:
# find the total number of words in clened dataset of both spam and non-spam email bodies
# find 10 common use in both spam and non-spam email bodies
# I do, >> misunderstand about the given command from the teacher
words_len = []
for index in spam_emails_index:
    words_len.append(len(spam_emails_df.loc[index].MESSAGE))
spam_word_amount = sum(words_len)
print('spam word amoun :', spam_word_amount)
for index in non_spam_emails_index:
    words_len.append(len(non_spam_emails_df.loc[index].MESSAGE))
non_spam_word_amount = sum(words_len) - spam_word_amount
print('non-spam word amoun :', non_spam_word_amount)

df['words_amount'] = pd.Series(words_len)
df

#### teacher do

In [9]:
spam_word_series = df['MESSAGE'].loc[spam_emails_index]
non_spam_word_series = df['MESSAGE'].loc[non_spam_emails_index]
spam_words = pd.Series([word for words_list in spam_word_series for word in words_list])
non_spam_words = pd.Series([word for words_list in non_spam_word_series for word in words_list])
print('spam word amoun :', len(spam_words))
print('non-spam word amoun :', len(non_spam_words))

spam word amoun : 320571
non-spam word amoun : 441403


In [None]:
print('total number of word :', spam_words.value_counts().shape[0])
print(spam_words.value_counts()[0:10])

In [None]:
print('total number of word :', non_spam_words.value_counts().shape[0])
print(non_spam_words.value_counts()[0:10])

## Creating a Word Cloud

In [None]:
# preinstall open anaconda termial
# > conda install -c conda-forge wordcloud

In [None]:
word_cloud = WordCloud().generate(email_body)
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
example = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
#print(type(example))
#print(example)

In [None]:
example = [''.join(word) for word in example]
#print(type(example))
#print(example)

In [None]:
example = ' '.join(example)
#print(type(example))
#print(example)

In [None]:
icon = Image.open(WHALE_FILE)
mask = Image.new(mode='RGB', size=icon.size, color=(255, 255, 255))  # create all white image
mask.paste(icon, box=icon)  # paste the black one on it
rgb_array = np.array(mask)  # get new image >> convert to array

word_cloud = WordCloud(mask=rgb_array, background_color='white',
                       max_words=1000, colormap='ocean').generate(example)
plt.figure(figsize=[16, 8])
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## Word Cloud of Non-Spam and Spam Message

In [None]:
%%time
icon = Image.open(THUMBS_DOWN_FILE)
mask = Image.new(mode='RGB', size=icon.size, color=(255, 255, 255))  # create all white image
mask.paste(icon, box=icon)  # paste the black one on it
rgb_array = np.array(mask)  # get new image >> convert to array

word_cloud = WordCloud(mask=rgb_array, background_color='white',
                       max_words=1300, colormap='gist_heat', font_path=FONT_PATH)\
                       .generate(' '.join(spam_words))
plt.figure(figsize=[20, 12])
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
%%time
icon = Image.open(THUMBS_UP_FILE)
mask = Image.new(mode='RGB', size=icon.size, color=(255, 255, 255))  # create all white image
mask.paste(icon, box=icon)  # paste the black one on it
rgb_array = np.array(mask)  # get new image >> convert to array

word_cloud = WordCloud(mask=rgb_array, background_color='white',
                       max_words=1000, colormap='gist_heat', font_path=FONT_PATH)\
                       .generate(' '.join(non_spam_words))
plt.figure(figsize=[16, 8])
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Generate Vocabulary & Dictionary

In [12]:
freq_spam_words = spam_words.value_counts()[:VOCAB_SIZE]  # top most 2500 used in spam email
freq_spam_words[0:10]

http      3101
email     3094
free      2555
click     2058
receiv    1987
list      1974
get       1903
pleas     1842
busi      1792
order     1743
dtype: int64

## Create Vocabulary DataFrame with word_id

In [13]:
_id = list(range(0, VOCAB_SIZE))
vocab_df = pd.DataFrame({'WORD': freq_spam_words.index}, index=_id)
vocab_df.to_csv('spam_word.csv')
vocab_df.index.name = 'WORD_ID'
vocab_df

Unnamed: 0_level_0,WORD
WORD_ID,Unnamed: 1_level_1
0,http
1,email
2,free
3,click
4,receiv
...,...
2495,ghetto
2496,indian
2497,hawaiian
2498,civilian


In [14]:
def is_in_vocab(word):
    return True if word in vocab_df['WORD'].values else False

In [None]:
is_in_vocab('free')

In [None]:
## Exsecise find the email with the most number of words 

In [15]:
words_len = []
for index in spam_emails_index:
    words_len.append(len(spam_emails_df.loc[index].MESSAGE))
spam_word_amount = sum(words_len)
print('spam word amoun :', spam_word_amount)
for index in non_spam_emails_index:
    words_len.append(len(non_spam_emails_df.loc[index].MESSAGE))
non_spam_word_amount = sum(words_len) - spam_word_amount
print('non-spam word amoun :', non_spam_word_amount)

df['words_amount'] = pd.Series(words_len)
df.head()

spam word amoun : 320571
non-spam word amoun : 441403


Unnamed: 0_level_0,MESSAGE,CATEGORY,file_name,words_amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[save, life, insur, spend, life, quot, save, g...",1,00001.7848dde101aa985090474a91ec93fcf0,92
1,"[fight, risk, cancer, http, slim, guarante, lo...",1,00002.d94f1b97e48ed3b553b3508d116e6a09,56
2,"[fight, risk, cancer, http, slim, guarante, lo...",1,00003.2ee33bc6eacdb11f38d052c44819ba6c,44
3,"[adult, club, offer, free, membership, instant...",1,00004.eac8de8d759b7e74154f142194282724,205
4,"[thought, might, like, slim, guarante, lose, l...",1,00005.57696a39d7d84318ce497886896bf90d,45


In [16]:
longest_email = df[df['words_amount'].max() == df['words_amount']]
longest_email

Unnamed: 0_level_0,MESSAGE,CATEGORY,file_name,words_amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5775,"[yahoo, group, sponsor, dvd, free, p, join, ht...",0,01380.e3fad5af747d3a110008f94a046bf31b,7671


# Generate Features & Sparse Matrix

### Creating a DaraFrame with one word per data

In [17]:
spam_emails_df['MESSAGE']

id
0       [save, life, insur, spend, life, quot, save, g...
1       [fight, risk, cancer, http, slim, guarante, lo...
2       [fight, risk, cancer, http, slim, guarante, lo...
3       [adult, club, offer, free, membership, instant...
4       [thought, might, like, slim, guarante, lose, l...
                              ...                        
1891    [want, boss, train, home, studi, thousand, peo...
1892    [messag, mime, format, prefer, doctor, order, ...
1893    [dear, subscrib, could, show, way, get, visito...
1894    [custom, appreci, sale, express, appreci, loya...
1895    [attn, strictli, confidenti, pleas, introduc, ...
Name: MESSAGE, Length: 1896, dtype: object

In [19]:
words_per_email_list = df['MESSAGE'].tolist()  # get the list for each MESSAGE of email
words_column_df = pd.DataFrame.from_records(words_per_email_list)
words_column_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
0,save,life,insur,spend,life,quot,save,g,famili,financi,...,,,,,,,,,,
1,fight,risk,cancer,http,slim,guarante,lose,lb,day,http,...,,,,,,,,,,
2,fight,risk,cancer,http,slim,guarante,lose,lb,day,http,...,,,,,,,,,,
3,adult,club,offer,free,membership,instant,access,site,user,name,...,,,,,,,,,,
4,thought,might,like,slim,guarante,lose,lb,day,http,fight,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5791,http,bizarr,collect,stuf,anim,could,fetch,sold,cornwal,museum,...,,,,,,,,,,
5792,care,use,one,also,realli,cute,thing,japanes,av,girl,...,,,,,,,,,,
5793,sm,skip,montanaro,write,jeremi,put,anoth,way,interest,hear,...,,,,,,,,,,
5794,mark,hammond,like,given,zodb,sound,attract,would,packag,hundr,...,,,,,,,,,,


In [20]:
X_train, X_test, y_train, y_test = train_test_split(words_column_df, df['CATEGORY'],
                                                   test_size=0.3, random_state=42)

In [71]:
print('Number of training samples', X_train.shape[0])
print('Number of testing samples', X_test.shape[0])
print('Fraction of training set', X_train.shape[0]/words_column_df.shape[0])
X_train.index.name = 'DOC_ID'
X_test.index.name = 'DOC_ID'
X_train.head()

Number of training samples 4057
Number of testing samples 1739
Fraction of training set 0.6999654934437544


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4844,ye,inde,agent,directori,verita,cd,unix,subdirectori,file,call,...,,,,,,,,,,
4727,problem,come,tri,instal,harddissssk,like,alreadi,mount,http,yahoo,...,,,,,,,,,,
5022,origin,messag,date,mon,aug,chad,norwood,sven,cc,subject,...,,,,,,,,,,
3504,inlin,folk,sever,major,internet,outag,morn,across,major,provid,...,,,,,,,,,,
3921,url,http,date,bath,chronicl,,,,,,...,,,,,,,,,,


## Create a Sparse Matric for the Training Data

In [22]:
word_index = pd.Index(vocab_df['WORD'])  # top 2500 word's indexs
word_index

Index(['http', 'email', 'free', 'click', 'receiv', 'list', 'get', 'pleas',
       'busi', 'order',
       ...
       'strive', 'sic', 'struggl', 'wet', 'merger', 'ghetto', 'indian',
       'hawaiian', 'civilian', 'earth'],
      dtype='object', name='WORD', length=2500)

In [28]:
# X_train.index[112] >> get the name of index(DOC_id) at index 112 >> (DOC_ID)
# y_train.at[112] >> get the value at index 112 >> (CATEGORY)
# word_index.get_loc('word') >> get the index of giver string


In [29]:
def get_sparse_matrix(df, word_index, labels):
    """
    Return sparse matric as dataframe
    df: A dataframe words in the columns with a document id as an index X_test, X_train
    word_index: index of words ordered by word id
    labels: catagory as a series y_test, t_train
    """
    row_number = df.shape[0]
    column_number = df.shape[1]
    word_set = set(word_index)
    dict_list = []
    
    for i in range(row_number):
        for j in range(column_number):
            word = df.iat[i, j]
            if word in word_set:
                doc_id = df.index[i]
                word_id = word_index.get_loc(word)
                category = labels.at[doc_id]
                
                item = {'LABEL': category, 'DOC_ID': doc_id, 'OCCURENCE': 1, 'WORD_ID': word_id}
                
                dict_list.append(item)
    
    return pd.DataFrame(dict_list)

In [30]:
%%time
sparse_train_df = get_sparse_matrix(X_train, word_index, y_train)

Wall time: 2min 8s


In [31]:
sparse_train_df

Unnamed: 0,LABEL,DOC_ID,OCCURENCE,WORD_ID
0,0,4844,1,370
1,0,4844,1,1499
2,0,4844,1,270
3,0,4844,1,558
4,0,4844,1,172
...,...,...,...,...
404311,1,860,1,48
404312,1,860,1,881
404313,1,860,1,11
404314,1,860,1,3


## Combine Occurrences with Pandas groupby() Method

In [32]:
sparse_train_df = sparse_train_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum()
sparse_train_df = sparse_train_df.reset_index()
sparse_train_df

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
0,0,1,1,2
1,0,2,1,3
2,0,3,1,2
3,0,5,1,1
4,0,6,1,1
...,...,...,...,...
241948,5795,2077,0,1
241949,5795,2164,0,7
241950,5795,2271,0,1
241951,5795,2360,0,1


In [33]:
DOC_ID = 5795
WORD_ID = 2197
print('word :', vocab_df.at[WORD_ID, 'WORD'])
print('occure :', df['MESSAGE'][DOC_ID].count(vocab_df.at[WORD_ID, 'WORD']))

word : throughout
occure : 0


## Saving Traning Data as .txt File

In [34]:
np.savetxt(TRAINING_DATA_FILE, sparse_train_df, fmt='%d')

In [None]:
## challenge: create sparse matrix for the test data >> do the same just like we did above and save it

In [35]:
%%time
sparse_test_df = get_sparse_matrix(X_test, word_index, y_test)
sparse_test_df = sparse_test_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum()
sparse_test_df = sparse_test_df.reset_index()
sparse_test_df

Wall time: 58 s


Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
0,8,1,1,4
1,8,2,1,4
2,8,4,1,5
3,8,5,1,1
4,8,6,1,2
...,...,...,...,...
110983,5793,1918,0,1
110984,5793,1982,0,3
110985,5793,2209,0,1
110986,5793,2252,0,1


In [36]:
np.savetxt(TEST_DATA_FILE, sparse_train_df, fmt='%d')

# Pre-Processing Subtleties and Checking your Understand

We started with 5796 emails. >> split to 4057 emails for training and 1739 emails for testing

1. How many individual emails were include in the traning .txt file? Count the number in test_grouped DataFrame.

After spritting and shuffing our data, how many emails were included in the X_test DataFrame? Is the same number?

In [69]:
df['CATEGORY'].value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [64]:
train_doc_ids = set(sparse_train_df['DOC_ID'])
test_doc_ids = set(sparse_test_df['DOC_ID'])
print('Individual train email amount:', len(train_doc_ids))
print('Individual test email amount:', len(test_doc_ids))

Individual train email amount: 4015
Individual test email amount: 1724


In [67]:
set(X_test.index.values) - test_doc_ids 

{134, 179, 240, 274, 298, 339, 439, 471, 670, 734, 765, 945, 1544, 1670, 1700}

In [75]:
df.MESSAGE[670]  # after clenaing there's no msg with all above emails

['kiaqicogkiaqicogkiaqicogkiaqicogkiaqicogkiaqicogkiaqicogkjwv']