In [164]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score


In [165]:
# Load the labelled snippets and cut them into sentence-sized pieces.
# NOTE(review): the path is an absolute, machine-specific location; consider a
# configurable data directory so the notebook runs elsewhere.
snippets_path = '/Users/vivek/SororityProject/Data/snippets_1.txt'
with open(snippets_path) as snippets_file:
    lines = list(snippets_file)

# Only the first line of the file is used.
# NOTE(review): presumably the whole corpus sits on one line -- confirm, since
# any further lines are ignored here.
corpus = lines[0]

# Naive sentence split: every "." is treated as a sentence boundary.
sentences = corpus.split(".")
sentences[:5]

['I love this girl already, she was so incredibly excited to be here in delta zeta specifically(T,I,E)',
 ' She wanted to know all about what made me choose DZ, what the house is like, my favorite part about the girls, etc(T,I,E)',
 ' She kept complimenting our chapter on how well we did everything and how easy it is to talk to us yesterday and today(A,I,E)',
 ' She said that she sees us as leaders and aspires to be like us(A,I,E)',
 ' She wants to be a teacher and was very interested in SeriousFun(I,I,E)']

In [166]:
# Build the labelled DataFrame from the raw sentences.
#
# Each usable sentence ends with a label tuple of the form "(x,PI,EI)".
# The text before the tuple becomes 'Text'; the 2nd and 3rd entries of the
# tuple become 'PI Label' and 'EI Label'. Sentences without a 3-entry tuple
# are skipped.

# Letter -> integer code for each label column. Labels are stripped of
# surrounding whitespace before lookup, so no whitespace variants are needed.
PI_LABEL_CODES = {'I': 1, 'N': 0, 'P': 2}
EI_LABEL_CODES = {'E': 1, 'N': 0, 'I': 2}

records = []
for sentence in sentences:
    # The label tuple is the LAST parenthesised group, so search from the
    # right; a "(" inside the sentence text must not be mistaken for it.
    tup_ind = sentence.rfind("(")
    if tup_ind == -1:
        continue
    text = sentence[:tup_ind]
    # Tolerate trailing whitespace after the closing parenthesis.
    label_tup = sentence[tup_ind + 1:].strip().rstrip(")")
    labels = tuple(label.strip() for label in label_tup.split(","))
    if len(labels) == 3:
        records.append({'Text': text, 'PI Label': labels[1], 'EI Label': labels[2]})

# Create the frame once from all records instead of growing it row by row.
# dtype=object mirrors the original empty-frame construction.
df = pd.DataFrame(records, columns=['Text', 'PI Label', 'EI Label'], dtype=object)

# Unknown label letters are left untouched by replace(); infer_objects() then
# yields an integer column whenever every label was recognised, without
# relying on replace() downcasting implicitly.
df['PI Label'] = df['PI Label'].replace(PI_LABEL_CODES).infer_objects()
df['EI Label'] = df['EI Label'].replace(EI_LABEL_CODES).infer_objects()




            
        


In [167]:
# Text preprocessing: produces the 'text_final' column consumed by the
# vectorizer (a string rendering of each row's lemmatized token list).

# Step - a : Remove rows whose text is missing.
# dropna must be applied to the frame itself: calling it with inplace=True on
# the df['Text'] Series never removes rows from df. The index is reset so rows
# stay numbered 0..n-1.
df = df.dropna(subset=['Text']).reset_index(drop=True)

# Step - b : Change all the text to lower case, because 'dog' and 'DOG' would
# otherwise be treated as different tokens.
df['Text'] = [entry.lower() for entry in df['Text']]

# Step - c : Tokenization: each entry is broken into a list of word tokens.
df['Text'] = [word_tokenize(entry) for entry in df['Text']]

# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize.
# WordNetLemmatizer needs a WordNet POS constant; any tag whose first letter
# is not J / V / R falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant setup hoisted out of the loops: load the stop-word list once
# (as a set for fast membership tests) and reuse a single lemmatizer.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

text_final = []
for entry in df['Text']:
    # pos_tag yields (word, tag) pairs; the tag's first letter selects the
    # WordNet part of speech used for lemmatization.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in stop_words and word.isalpha()
    ]
    # Stored as the string form of the list, which is what the TF-IDF step
    # downstream receives.
    text_final.append(str(Final_words))

# Assign the whole column at once rather than by index label, so the result
# does not depend on what the index labels happen to be.
df['text_final'] = text_final
df
    


Unnamed: 0,Text,PI Label,EI Label,text_final
0,"[i, love, this, girl, already, ,, she, was, so...",1,1,"['love', 'girl', 'already', 'incredibly', 'exc..."
1,"[she, wanted, to, know, all, about, what, made...",1,1,"['want', 'know', 'make', 'choose', 'dz', 'hous..."
2,"[she, kept, complimenting, our, chapter, on, h...",1,1,"['keep', 'compliment', 'chapter', 'well', 'eve..."
3,"[she, said, that, she, sees, us, as, leaders, ...",1,1,"['say', 'see', 'u', 'leader', 'aspires', 'like..."
4,"[she, wants, to, be, a, teacher, and, was, ver...",1,1,"['want', 'teacher', 'interested', 'seriousfun']"
...,...,...,...,...
96,"[having, reviewed, the, sorority, candidate, '...",0,0,"['review', 'sorority', 'candidate', 'applicati..."
97,"[her, passion, for, promoting, diversity, and,...",1,1,"['passion', 'promote', 'diversity', 'inclusion..."
98,"[her, past, involvement, in, leadership, posit...",1,1,"['past', 'involvement', 'leadership', 'positio..."
99,"[her, friendly, and, outgoing, demeanor, would...",1,1,"['friendly', 'outgoing', 'demeanor', 'would', ..."


In [168]:
# Positional hold-out split on the 'PI Label' target: the first 24 rows are
# the test set, every remaining row is training data.
# NOTE(review): presumably the first 24 rows are a deliberate held-out set --
# confirm. A random alternative would be
# model_selection.train_test_split(df['text_final'], df['PI Label'], test_size=0.3).
test_rows = slice(None, 24)
train_rows = slice(24, None)

Test_X = df['text_final'].iloc[test_rows]
Test_Y = df['PI Label'].iloc[test_rows]
Train_X = df['text_final'].iloc[train_rows]
Train_Y = df['PI Label'].iloc[train_rows]

In [169]:
# TF-IDF features + Multinomial Naive Bayes on the 'PI Label' target.

# Learn the vocabulary and IDF weights from the TRAINING rows only. Fitting on
# the full corpus would let the held-out rows influence the features and make
# the reported accuracy optimistic. Test-set words that never occur in the
# training rows are simply ignored by transform().
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Train_X)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Fit the classifier on the training features.
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# Predict the labels of the held-out rows.
predictions_NB = Naive.predict(Test_X_Tfidf)

# accuracy_score takes (y_true, y_pred); the result is reported as a percentage.
print("Naive Bayes Accuracy Score -> ",accuracy_score(Test_Y, predictions_NB)*100)







Naive Bayes Accuracy Score ->  91.66666666666666


In [170]:
# Support Vector Machine classifier trained on the same TF-IDF features.
# NOTE(review): with kernel='linear' the degree and gamma settings should have
# no effect per the scikit-learn SVC documentation -- confirm before removing.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# Predict the held-out rows, then score them. Arguments are passed in the
# documented (y_true, y_pred) order; accuracy is symmetric, so the value is
# the same either way.
predictions_SVM = SVM.predict(Test_X_Tfidf)
svm_accuracy = accuracy_score(Test_Y, predictions_SVM)
print("SVM Accuracy Score -> ",svm_accuracy*100)

SVM Accuracy Score ->  91.66666666666666
