## Data Analysis for Masters Thesis 1

### Title: Stemming algorithms for English

### Algorithms being studied

- Porter
- Lancaster
- Snowball (Porter2)

#### Importing Required Packages

In [1]:
from nltk.stem import PorterStemmer, LancasterStemmer, snowball
from nltk.corpus import brown
import pandas as pd

#### Extracting words from the corpora and preparing them for further processing

In [2]:
words = brown.words()
corpus = ' '.join(words)

In [3]:
print("Total number of words in the corpus: ", len(words))
print("Total number of unique tokens: ", len(set(words)))

Total number of words in the corpus:  1161192
Total number of unique tokens:  56057


In [4]:
corpus = corpus.lower()
corpus = corpus.replace("\n"," ")
corpus = corpus.replace("-", " ")
cleaned_corpus = [i for i in corpus if i.isalpha() or i==' ']
cleaned_corpus = ''.join(cleaned_corpus)
rev_corpus = [i[::-1] for i in cleaned_corpus]

#### **Removing functional words and sorting according to the reverse of the spelling so as it order it according to the word endings.**

In [5]:
functional_words = [
    "the", "a", "an",
    "in", "on", "at", "by", "for", "with", "to", "from", "of", "about", "through", "between", "among", "under", "over",
    "and", "but", "or", "nor", "for", "so", "yet",
    "although", "because", "if", "unless", "since", "while", "when", "after", "before", "as", "though",
    "I", "you", "he", "she", "it", "we", "they", "me", "you", "him", "her", "us", "them",
    "my", "your", "his", "her", "its", "our", "their", "mine", "yours", "hers", "ours", "theirs",
    "myself", "yourself", "himself", "herself", "itself", "ourselves", "yourselves", "themselves",
    "who", "whom", "whose", "which", "that",
    "this", "that", "these", "those",
    "who", "whom", "whose", "which", "what",
    "all", "another", "any", "anybody", "anyone", "anything", "both", "each", "either", "everybody", "everyone", "everything", "neither", "nobody", "no one", "nothing", "several", "some", "somebody", "someone", "something",
    "can", "could", "may", "might", "must", "shall", "should", "will", "would",
    "am", "is", "are", "was", "were", "be", "being", "been", "have", "has", "had", "do", "does", "did",
    "also", "not", "never", "always", "very", "too", "so", "such", "here", "there", "now", "then", "when", "where",
    "today", "yesterday", "tomorrow", "soon", "now", "then", "already", "lately",
    "always", "usually", "often", "sometimes", "seldom", "never"
]

words = cleaned_corpus.split()

primary_dataset = [i for i in words if not i in functional_words]
primary_dataset = list(set(primary_dataset))
primary_dataset = [i[::-1] for i in primary_dataset]
primary_dataset.sort()
primary_dataset = [i[::-1] for i in primary_dataset]

In [6]:
print("Number of words in the primary dataset: ", len(primary_dataset))

Number of words in the primary dataset:  42551


### Applying systematic sampling (selecting every 10th word) total of 10% of the dataset.

In [7]:
sample_systematic = []

for i in range(0, len(primary_dataset), 10):
    sample_systematic.append(primary_dataset[i])

print("Sample size: ", len(sample_systematic))
print("Sample preview: ", sample_systematic[0:30])

Sample size:  4256
Sample preview:  ['aa', 'elba', 'tuba', 'jamaica', 'veronica', 'atlantica', 'dellarca', 'ywca', 'hedda', 'salida', 'veranda', 'tenda', 'soda', 'medea', 'anthea', 'andrea', 'hoffa', 'bottega', 'ticonderoga', 'mischa', 'pasha', 'bertha', 'suburbia', 'acadia', 'pharmacopoeia', 'bahia', 'malia', 'anglia', 'julia', 'lumia']


### Creating objects for each algorithm

In [8]:
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = snowball.EnglishStemmer()

### Applying stemming over the primary dataset and the sample dataset

In [9]:
def stemAll(dataset):
    stemmed_lancaster = [stemmer_lancaster.stem(i) for i in dataset]
    stemmed_porter = [stemmer_porter.stem(i) for i in dataset]
    stemmed_snowball = [stemmer_snowball.stem(i) for i in dataset]

    df_processed = pd.DataFrame({
    "Original" : dataset,
    "Porter" : stemmed_porter,
    "Lancaster" : stemmed_lancaster,
    "Snowball" : stemmed_snowball,
    }, index=range(1,len(dataset)+1))

    return df_processed

In [10]:
df_primary = stemAll(primary_dataset)
df_sample_systematic = stemAll(sample_systematic)
print(df_primary.head())
print()
print(df_sample_systematic.head())

  Original   Porter Lancaster Snowball
1       aa       aa        aa       aa
2      aaa      aaa        aa      aaa
3       ba       ba        ba       ba
4  barnaba  barnaba    barnab  barnaba
5     paba     paba       pab     paba

   Original    Porter Lancaster  Snowball
1        aa        aa        aa        aa
2      elba      elba       elb      elba
3      tuba      tuba       tub      tuba
4   jamaica   jamaica    jamaic   jamaica
5  veronica  veronica   veronic  veronica


#### Average lenght of Original words and stems produced

In [26]:
len_original = len_porter = len_lancaster = len_snowball = 0

len_primary = len(df_primary)

for i in range(1, len_primary+1):
    len_original += len(df_primary['Original'][i])
    len_porter += len(df_primary['Porter'][i])
    len_lancaster += len(df_primary['Lancaster'][i])
    len_snowball += len(df_primary['Snowball'][i])

print(f"Original: {len_original/len_primary}\nPorter: {len_porter/len_primary}\nLancaster: {len_lancaster/len_primary}\nSnowball: {len_snowball/len_primary}")

Original: 7.754858875232075
Porter: 6.345867312166576
Lancaster: 5.53432351766116
Snowball: 6.321825574017062


### Grouping outputs on the basis of stem produced by each algorithm

In [11]:
# Grouping by porter
group_porter_primary = df_primary.groupby("Porter").agg(lambda x: x.tolist())
group_porter_sample_systematic = df_sample_systematic.groupby("Porter").agg(lambda x: x.tolist())

# Grouping by lancaster
group_lancaster_primary = df_primary.groupby("Lancaster").agg(lambda x: x.tolist())
group_lancaster_sample_systematic = df_sample_systematic.groupby("Lancaster").agg(lambda x: x.tolist())

# Grouping by snowball
group_snowball_primary = df_primary.groupby("Snowball").agg(lambda x: x.tolist())
group_snowball_sample_systematic = df_sample_systematic.groupby("Snowball").agg(lambda x: x.tolist())

### Words for which all algorithms produce a common stem

In [12]:
common_stems_primary = df_primary[(df_primary["Lancaster"] == df_primary["Porter"]) & (df_primary["Porter"] == df_primary["Snowball"])].reset_index()
common_stems_sample_sys = df_sample_systematic[(df_sample_systematic["Lancaster"] == df_sample_systematic["Porter"]) & (df_sample_systematic["Porter"] == df_sample_systematic["Snowball"])].reset_index()
common_stems_sample_sys

Unnamed: 0,index,Original,Porter,Lancaster,Snowball
0,1,aa,aa,aa,aa
1,73,spa,spa,spa,spa
2,104,cab,cab,cab,cab
3,105,grab,grab,grab,grab
4,107,caleb,caleb,caleb,caleb
...,...,...,...,...,...
2381,4252,merz,merz,merz,merz
2382,4253,livshitz,livshitz,livshitz,livshitz
2383,4254,markovitz,markovitz,markovitz,markovitz
2384,4255,schwartz,schwartz,schwartz,schwartz


### Eliminating words which are not stemmed

In [13]:
common_stems_primary_stemmed = common_stems_primary[common_stems_primary["Original"] != common_stems_primary["Porter"]].reset_index()
common_stems_sys_stemmed = common_stems_primary[common_stems_primary["Original"] != common_stems_primary["Porter"]].reset_index()
common_stems_primary_stemmed = common_stems_primary_stemmed.drop(["level_0", "index"], axis = 1)
common_stems_sys_stemmed = common_stems_sys_stemmed.drop(["level_0", "index"], axis = 1)

### Reduced size of the dataset after application of algorithms

In [14]:
print(f"Original: {len(primary_dataset)}\nPorter: {len(group_porter_primary)}\nLancaster: {len(group_lancaster_primary)}\nSnowball: {len(group_snowball_primary)}")

Original: 42551
Porter: 26202
Lancaster: 21361
Snowball: 25797


### Where ouput from snowball varies from Porter

In [15]:
porter_vs_snowball = df_primary[df_primary["Porter"] != df_primary["Snowball"]].reset_index()
porter_vs_snowball = porter_vs_snowball.drop("Lancaster", axis=1)

# Writing all of this to the output files

In [16]:
df_primary.to_csv("./outputs/primary_dataset.csv")
df_sample_systematic.to_csv("./outputs/sample_dataset.csv")
group_porter_primary.to_csv("./outputs/group_porter_primary.csv")
group_porter_sample_systematic.to_csv("./outputs/group_porter_sample.csv")
group_lancaster_primary.to_csv("./outputs/group_lancaster_primary.csv")
group_lancaster_sample_systematic.to_csv("./outputs/group_lancaster_sample.csv")
group_snowball_primary.to_csv("./outputs/group_snowball_primary.csv")
group_snowball_sample_systematic.to_csv("./outputs/group_snowball_sample.csv")
common_stems_primary.to_csv("./outputs/common_stems_primary.csv")
common_stems_sample_sys.to_csv("./outputs/common_stems_sample.csv")
common_stems_primary_stemmed.to_csv("./outputs/common_stems_primary_stemmed.csv")
common_stems_sys_stemmed.to_csv("./outputs/common_stems_sample_stemmed.csv")
porter_vs_snowball.to_csv("./outputs/porter_vs_snowball.csv")