# Data Analysis for Masters Thesis 1

### Title: Stemming algorithms for English

### Algorithms being studied

- Porter
- Lancaster
- Snowball (Porter2)

#### Importing Required Packages

In [1]:
from nltk.stem import PorterStemmer, LancasterStemmer, snowball
from nltk.corpus import brown
import pandas as pd

#### Extracting words from the corpora and preparing them for further processing

In [2]:
words = brown.words()
corpus = ' '.join(words)

Total number of unique tokens:

In [3]:
print(len(set(words)))

56057


In [4]:
corpus = corpus.lower()
corpus = corpus.replace("\n"," ")
corpus = corpus.replace("-", " ")
cleaned_corpus = [i for i in corpus if i.isalpha() or i==' ']
cleaned_corpus = ''.join(cleaned_corpus)
rev_corpus = [i[::-1] for i in cleaned_corpus]

**Removing functional words and sorting according to the reverse of the spelling so as it order it according to the word endings.**

In [5]:
functional_words = [
    "the", "a", "an",
    "in", "on", "at", "by", "for", "with", "to", "from", "of", "about", "through", "between", "among", "under", "over",
    "and", "but", "or", "nor", "for", "so", "yet",
    "although", "because", "if", "unless", "since", "while", "when", "after", "before", "as", "though",
    "I", "you", "he", "she", "it", "we", "they", "me", "you", "him", "her", "us", "them",
    "my", "your", "his", "her", "its", "our", "their", "mine", "yours", "hers", "ours", "theirs",
    "myself", "yourself", "himself", "herself", "itself", "ourselves", "yourselves", "themselves",
    "who", "whom", "whose", "which", "that",
    "this", "that", "these", "those",
    "who", "whom", "whose", "which", "what",
    "all", "another", "any", "anybody", "anyone", "anything", "both", "each", "either", "everybody", "everyone", "everything", "neither", "nobody", "no one", "nothing", "several", "some", "somebody", "someone", "something",
    "can", "could", "may", "might", "must", "shall", "should", "will", "would",
    "am", "is", "are", "was", "were", "be", "being", "been", "have", "has", "had", "do", "does", "did",
    "also", "not", "never", "always", "very", "too", "so", "such", "here", "there", "now", "then", "when", "where",
    "today", "yesterday", "tomorrow", "soon", "now", "then", "already", "lately",
    "always", "usually", "often", "sometimes", "seldom", "never"
]

words = cleaned_corpus.split()

primary_dataset = [i for i in words if not i in functional_words]
primary_dataset = list(set(primary_dataset))
primary_dataset = [i[::-1] for i in primary_dataset]
primary_dataset.sort()
primary_dataset = [i[::-1] for i in primary_dataset]

**Number of words:**

In [6]:
len(primary_dataset)

42551

### Applying systematic sampling (selecting every 10th word) total of 10% of the dataset.

In [7]:
sample_systematic = []

for i in range(0, len(primary_dataset), 10):
    sample_systematic.append(primary_dataset[i])

print("Sample size: ", len(sample_systematic))
print("Sample preview: ", sample_systematic[0:30])

Sample size:  4256
Sample preview:  ['aa', 'elba', 'tuba', 'jamaica', 'veronica', 'atlantica', 'dellarca', 'ywca', 'hedda', 'salida', 'veranda', 'tenda', 'soda', 'medea', 'anthea', 'andrea', 'hoffa', 'bottega', 'ticonderoga', 'mischa', 'pasha', 'bertha', 'suburbia', 'acadia', 'pharmacopoeia', 'bahia', 'malia', 'anglia', 'julia', 'lumia']


### Creating objects for each algorithm

In [8]:
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = snowball.EnglishStemmer()

### Applying stemming over the primary dataset and the sample dataset

In [9]:
def stem_all(dataset):
    stemmed_lancaster = [stemmer_lancaster.stem(i) for i in dataset]
    stemmed_porter = [stemmer_porter.stem(i) for i in dataset]
    stemmed_snowball = [stemmer_snowball.stem(i) for i in dataset]

    df_processed = pd.DataFrame({
    "Original" : dataset,
    "Porter" : stemmed_porter,
    "Lancaster" : stemmed_lancaster,
    "Snowball" : stemmed_snowball,
    }, index=range(1,len(dataset)+1))

    return df_processed

In [10]:
df_primary = stem_all(primary_dataset)
df_sample_systematic = stem_all(sample_systematic)