In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Sample positive and negative movie reviews
positive_reviews = ["The acting was superb! A must-watch film.",
                    "I loved every moment of this movie. Highly recommended.",
                   "An outstanding performance by the cast.",
                    "A heartwarming story that left me with a smile."]

negative_reviews = ["Terrible acting. I couldn't wait for it to end.",
                    "Boring plot and unconvincing characters.",
                   "A complete waste of time and money.",
                    "The worst movie I've seen in years."]

neutral_reviews = [
    "The movie was okay. It didn't leave a strong impression.",
    "I have mixed feelings about this film. It had its moments.",
    "Not great, not terrible. It's a decent watch."]

# Create a DataFrame
data = {'text': positive_reviews + negative_reviews + neutral_reviews, 'sentiment': ['Positive'] * len(positive_reviews) + ['Negative'] * len(negative_reviews) + ['Neutral'] * len(neutral_reviews)}
df = pd.DataFrame(data)

# Shuffle the DataFrame to mix positive and negative reviews
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the DataFrame
print(df)

                                                 text sentiment
0            Boring plot and unconvincing characters.  Negative
1           The acting was superb! A must-watch film.  Positive
2   I have mixed feelings about this film. It had ...   Neutral
3       Not great, not terrible. It's a decent watch.   Neutral
4             An outstanding performance by the cast.  Positive
5   I loved every moment of this movie. Highly rec...  Positive
6   The movie was okay. It didn't leave a strong i...   Neutral
7     Terrible acting. I couldn't wait for it to end.  Negative
8                 The worst movie I've seen in years.  Negative
9     A heartwarming story that left me with a smile.  Positive
10                A complete waste of time and money.  Negative


In [3]:
# start of basic feature engineering 
# count the number of words in each row, the number of characters, the number of nonstop words, and the number of commas 
import re
import nltk
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

#creating a function to encapsulate preprocessing, to make it easy to replicate on  submission data
def processing(df):
    # making a new column in the dataframe that will contain the processed text 
    # for each entry in the "text" column lowercase the text and remove punctuation using regular expression 
    # lowering and removing punctuation
    df['processed'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))
    
    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(lambda x: len(x))
    
    #get number of words
    df['words'] = df['processed'].apply(lambda x: len(x.split(' ')))
    df['words_not_stopword'] = df['processed'].apply(lambda x: len([t for t in x.split(' ') if t not in stopWords]))

    # adding new features 
    # 1. number of adjectives, nouns, verbs 
    df['tagged_words'] = df['processed'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))
    df['adjective_count'] = df['tagged_words'].apply(lambda x: sum(1 for word, tag in x if tag.startswith('JJ')))
    df['noun_count'] = df['tagged_words'].apply(lambda x: sum(1 for word, tag in x if tag.startswith('NN')))
    df['verb_count'] = df['tagged_words'].apply(lambda x: sum(1 for word, tag in x if tag.startswith('VB')))

    #for i in range (5):
        #print(i, ": ", tagged_words[i], "\n")
    
    # get the average word length
    df['avg_word_length'] = df['processed'].apply(lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopWords]) if len([len(t) for t in x.split(' ') if t not in stopWords]) > 0 else 0)
    
    # get the average word length
    df['commas'] = df['text'].apply(lambda x: x.count(','))

    return(df)

df = processing(df)
df.shape[0]
df.head()



Unnamed: 0,text,sentiment,processed,length,words,words_not_stopword,tagged_words,adjective_count,noun_count,verb_count,avg_word_length,commas
0,Boring plot and unconvincing characters.,Negative,boring plot and unconvincing characters,39,5,4,"[(boring, NN), (plot, NN), (and, CC), (unconvi...",1,3,0,8.0,0
1,The acting was superb! A must-watch film.,Positive,the acting was superb a mustwatch film,38,7,4,"[(the, DT), (acting, NN), (was, VBD), (superb,...",0,3,2,6.25,0
2,I have mixed feelings about this film. It had ...,Neutral,i have mixed feelings about this film it had i...,56,11,4,"[(i, NNS), (have, VBP), (mixed, VBN), (feeling...",0,4,3,6.0,0
3,"Not great, not terrible. It's a decent watch.",Neutral,not great not terrible its a decent watch,41,8,4,"[(not, RB), (great, JJ), (not, RB), (terrible,...",3,1,0,6.0,1
4,An outstanding performance by the cast.,Positive,an outstanding performance by the cast,38,6,3,"[(an, DT), (outstanding, JJ), (performance, NN...",1,2,0,8.666667,0


In [11]:
# split data into train and test 
from sklearn.model_selection import train_test_split

features= [c for c in df.columns.values if c not in ['id','text','sentiment']]
numeric_features= [c for c in df.columns.values if c  not in ['id','text','sentiment','processed']]
target = 'sentiment'

print("features: ", features)
print("\nnumeric features: ", numeric_features)
print("\ntarget: \n", df[target])

print("\n dffeatures.head\n",df[features].head())
print("\ndfnumerica_dfeatures.head\n",df[numeric_features].head(), "\n")
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42)
type(X_train)
print("\n X_train head\n",X_train.head())
print("\n X_train shape\n",X_train.shape)

features:  ['processed', 'length', 'words', 'words_not_stopword', 'tagged_words', 'adjective_count', 'noun_count', 'verb_count', 'avg_word_length', 'commas']

numeric features:  ['length', 'words', 'words_not_stopword', 'tagged_words', 'adjective_count', 'noun_count', 'verb_count', 'avg_word_length', 'commas']

target: 
 0     Negative
1     Positive
2      Neutral
3      Neutral
4     Positive
5     Positive
6      Neutral
7     Negative
8     Negative
9     Positive
10    Negative
Name: sentiment, dtype: object

 dffeatures.head
                                            processed  length  words  \
0            boring plot and unconvincing characters      39      5   
1             the acting was superb a mustwatch film      38      7   
2  i have mixed feelings about this film it had i...      56     11   
3          not great not terrible its a decent watch      41      8   
4             an outstanding performance by the cast      38      6   

   words_not_stopword              

In [13]:
# we want to return one column of dataframe given a particular key value
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [14]:
# make a mini pipeline that comsists of two steps 
# 1. grab particular column from dataset 
# 2. perform tf-idf on that column and return results 
# what is a transformer ?? 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

text = Pipeline([
                # Text selector returns the processed column in the dataframe(the column with the clean text)
                ('selector', TextSelector(key='processed')),
                ('tfidf', TfidfVectorizer(stop_words='english'))
            ])

In [17]:
textSel = TextSelector(key='processed')
testDF = textSel.transform(X_train)

print(testDF.iloc[0])
testDF.head()

i have mixed feelings about this film it had its moments


2    i have mixed feelings about this film it had i...
1               the acting was superb a mustwatch film
8                    the worst movie ive seen in years
4               an outstanding performance by the cast
7         terrible acting i couldnt wait for it to end
Name: processed, dtype: object

In [16]:
# print(pipeline[:-1].get_feature_names_out())
# print("before:", X_train.head())
res = text.fit_transform(X_train) # first pass of out X_train data through pipe line
# print("\n\nAfter:", X_train.head())
print(res.get_shape())
#13,117 vectors of dimension 21,516
#print("x_train: \n", X_train, "\n\n")

# when u print out the sparse matrix res, it will return a tuple of (row, col) values that are non
# zero that is why it is sparse, each coloumn corresponds to a different feature (words in this case)
# u can use the vocabulary attribute from the tfidf object to get the dicitonary [word: col index in sparse matrix]
# using this you can reform the sentence to see it 
print(res)
vectorizor = text.named_steps['tfidf']
print(vectorizor.vocabulary_)

(7, 26)
  (0, 12)	0.5206467559864713
  (0, 6)	0.43218152024617124
  (0, 5)	0.5206467559864713
  (0, 11)	0.5206467559864713
  (1, 14)	0.544082434129559
  (1, 20)	0.544082434129559
  (1, 0)	0.4516351457444982
  (1, 6)	0.4516351457444982
  (2, 25)	0.4618042361109319
  (2, 18)	0.4618042361109319
  (2, 9)	0.4618042361109319
  (2, 13)	0.38333717539523177
  (2, 24)	0.4618042361109319
  (3, 1)	0.5773502691896257
  (3, 17)	0.5773502691896257
  (3, 16)	0.5773502691896257
  (4, 4)	0.544082434129559
  (4, 22)	0.544082434129559
  (4, 21)	0.4516351457444982
  (4, 0)	0.4516351457444982
  (5, 23)	0.5206467559864713
  (5, 2)	0.5206467559864713
  (5, 7)	0.5206467559864713
  (5, 21)	0.43218152024617124
  (6, 8)	0.4192570829702294
  (6, 19)	0.4192570829702294
  (6, 10)	0.4192570829702294
  (6, 3)	0.4192570829702294
  (6, 15)	0.4192570829702294
  (6, 13)	0.3480193843688467
{'mixed': 11, 'feelings': 5, 'film': 6, 'moments': 12, 'acting': 0, 'superb': 20, 'mustwatch': 14, 'worst': 24, 'movie': 13, 'ive': 9, 

In [54]:
from sklearn.preprocessing import StandardScaler

length =  Pipeline([
                ('selector', NumberSelector(key='length')),
                ('standard', StandardScaler())
            ])

length.fit_transform(X_train)


array([[ 1.63474835],
       [-0.67961448],
       [-1.32249305],
       [-0.67961448],
       [ 0.09183979],
       [-0.29388734],
       [ 1.24902121]])

In [55]:
words =  Pipeline([
                ('selector', NumberSelector(key='words')),
                ('standard', StandardScaler())
            ])

### start of features other than words ??
words_not_stopword =  Pipeline([
                ('selector', NumberSelector(key='words_not_stopword')),
                ('standard', StandardScaler())
            ])
avg_word_length =  Pipeline([
                ('selector', NumberSelector(key='avg_word_length')),
                ('standard', StandardScaler())
            ])
commas =  Pipeline([
                ('selector', NumberSelector(key='commas')),
                ('standard', StandardScaler()),
            ])

# adding piplines for NEW FEATURES we created:
adjective_count =  Pipeline([
                ('selector', NumberSelector(key='adjective_count')),
                ('standard', StandardScaler()),
            ])

noun_count =  Pipeline([
                ('selector', NumberSelector(key='noun_count')),
                ('standard', StandardScaler()),
            ])

verb_count =  Pipeline([
                ('selector', NumberSelector(key='verb_count')),
                ('standard', StandardScaler()),
            ])

In [61]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('text', text), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas),
                      ('adjective_count', adjective_count),
                      ('verb_count', verb_count),
                      ('noun_count', noun_count)])

#print("before:", X_train.head())
feature_processing = Pipeline([('feats', feats)])
# fit transform returns transformed samples shape (n_samples, n_transformed_features)
catch = feature_processing.fit_transform(X_train)
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


#### What Pipelining is doing: There are 28 features in total and we have 5 samples, this was gotten from using shape on the matrix returned from fit_transform. The purpose of the pipeline in this usecase is to standardize all the numbers in our feature vectors. For example if you look at the length and verb_count features, their values for each sample differ in scale. length is in the 30 and 40s and verb count is less than 5, in order to standardize this as well as the rest of the features all together we pass them throufht the pipeline which scales the numbers to be closer together and not as drastic. 

In [57]:
print(X_train)
print(type(catch))
print(catch.shape)
print(catch[2])

                                           processed  length  words  \
2  i have mixed feelings about this film it had i...      56     11   
1             the acting was superb a mustwatch film      38      7   
8                  the worst movie ive seen in years      33      7   
4             an outstanding performance by the cast      38      6   
7       terrible acting i couldnt wait for it to end      44      9   
3          not great not terrible its a decent watch      41      8   
6  the movie was okay it didnt leave a strong imp...      53     10   

   words_not_stopword                                       tagged_words  \
2                   4  [(i, NNS), (have, VBP), (mixed, VBN), (feeling...   
1                   4  [(the, DT), (acting, NN), (was, VBD), (superb,...   
8                   5  [(the, DT), (worst, JJS), (movie, NN), (ive, J...   
4                   3  [(an, DT), (outstanding, JJ), (performance, NN...   
7                   5  [(terrible, JJ), (acting, VB

In [59]:
from sklearn.linear_model import LogisticRegression
# solver = sag for real multinomoial class 
pipeline = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(multi_class = 'multinomial', random_state = 42, max_iter = 2000)),
])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
np.mean(preds == y_test)

0.0

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [60]:
## testing if my thinking is correct 
adj = pipeline.named_steps['features'].transformer_list[6][1]
print(adj.named_steps)

# the coefficiant matrix is 3 by 21524, 3 because we have 3 different classes to predict and we have 21k columns 
# for the number of features we have for each feature vector 
print(pipeline.named_steps['classifier'].coef_.shape)

model = pipeline.named_steps['classifier']
print(model.classes_)
#print("y train:\n" , y_train.head())

#print(pipeline.named_steps['features'].transformer_list[6][1].named_steps['standard'].get_feature_names_out())
#print(len(pipeline.named_steps['features'].transformer_list[0][1].named_steps['tfidf'].get_feature_names_out()))


{'selector': NumberSelector(key='adjective_count'), 'standard': StandardScaler()}
(3, 34)
['Negative' 'Neutral' 'Positive']


## 1b. Printing weights learned for three new features

#### Note: in the above output we see that the total number of features is 21,524 and the number of features(unique words) in the first transformer in the featureUnion we created is 21516 so this means that feature 'adjective_count' is feature number 21,522 (21516+6, bc it is 6 places away from the text transformer in the transformer list). We will use this information to get the weights learned for the features adjective_count, verb_count, noun_count

In [151]:
print("New Feature Weights for EAP")
print("adjective_count: ", pipeline.named_steps['classifier'].coef_[0][21521])
print("verb_count: ", pipeline.named_steps['classifier'].coef_[0][21522])
print("noun_count: ", pipeline.named_steps['classifier'].coef_[0][21523])
print("\n")
print("New Feature Weights for HPL")
print("adjective_count: ", pipeline.named_steps['classifier'].coef_[1][21521])
print("verb_count: ", pipeline.named_steps['classifier'].coef_[1][21522])
print("noun_count: ", pipeline.named_steps['classifier'].coef_[1][21523])
print("\n")
print("New Feature Weights for MWS")
print("adjective_count: ", pipeline.named_steps['classifier'].coef_[2][21521])
print("verb_count: ", pipeline.named_steps['classifier'].coef_[2][21522])
print("noun_count: ", pipeline.named_steps['classifier'].coef_[2][21523])


New Feature Weights for EAP
adjective_count:  -0.1366969613723863
verb_count:  -0.599221179886205
noun_count:  -0.33187619939253094


New Feature Weights for HPL
adjective_count:  0.15459057609435142
verb_count:  0.10706833808475975
noun_count:  -0.09587487039082906


New Feature Weights for MWS
adjective_count:  -0.017893614721936915
verb_count:  0.4921528418014121
noun_count:  0.42775106978334226


In [None]:
# Step 2c: printing evaluation report 
print(classification_report(y_test, preds))

In [None]:
# a list of all the hyperparameters we could finetune 
pipeline.get_params()

In [None]:
pipeline.get_params().keys()

## 1a. Different Cross Validation Settings: 2-fold, 10-fold and 20-fold

In [124]:
from sklearn.model_selection import GridSearchCV

# since we are using logistic regression now classifier__max_depth is not a hyper parameter we can finetune
# look at the list of hyperparameters above to see what we can finetune instead 
hyperparameters = { 'features__text__tfidf__max_df': [0.9, 0.95],
                    'features__text__tfidf__ngram_range': [(1,1), (1,2)],
                    'classifier__class_weight': ['balanced'],
                    'classifier__solver': ['lbfgs', 'newton-cg', 'liblinear'],
                    'classifier__max_iter':[500,1000,1500],
                  }
# this is where we do cross validation 
# we want to see what are the best tunings for our hyperparameters so we need to split the 
# data three ways so that we have a batch to see what the best hyperparameters are 
clf2 = GridSearchCV(pipeline, hyperparameters, cv=2)
clf10 = GridSearchCV(pipeline, hyperparameters, cv=10)
clf20 = GridSearchCV(pipeline, hyperparameters, cv=20)
 
# Fit and tune model
clf2.fit(X_train, y_train)

In [None]:
clf2DF = pd.DataFrame(clf.cv_results_)
print(clf2DF)

In [120]:
clf2.best_params_

{'classifier__class_weight': 'balanced',
 'classifier__max_iter': 500,
 'classifier__solver': 'newton-cg',
 'features__text__tfidf__max_df': 0.9,
 'features__text__tfidf__ngram_range': (1, 1)}

In [129]:
#refitting on entire training data using best settings
clf2.refit

preds = clf2.predict(X_test)
probs = clf2.predict_proba(X_test)



In [130]:
print("2-fold cross validation:", np.mean(preds == y_test), "\n")

2-fold cross validation: 0.7816465490560198 



In [122]:
# Printing Evaluation report 
print("clf2: \n", classification_report(y_test, preds))

clf2: 
               precision    recall  f1-score   support

         EAP       0.78      0.78      0.78      2587
         HPL       0.77      0.80      0.78      1852
         MWS       0.79      0.77      0.78      2023

    accuracy                           0.78      6462
   macro avg       0.78      0.78      0.78      6462
weighted avg       0.78      0.78      0.78      6462



In [126]:
clf10.fit(X_train, y_train)
clf10.best_params_
clf10.refit

preds = clf10.predict(X_test)
probs = clf10.predict_proba(X_test)

np.mean(preds == y_test)
print("clf10: \n", classification_report(y_test, preds))

clf10: 
               precision    recall  f1-score   support

         EAP       0.78      0.78      0.78      2587
         HPL       0.77      0.80      0.78      1852
         MWS       0.79      0.77      0.78      2023

    accuracy                           0.78      6462
   macro avg       0.78      0.78      0.78      6462
weighted avg       0.78      0.78      0.78      6462



In [128]:
print("10-fold cross validation:", np.mean(preds == y_test), "\n")

10-fold cross validation: 0.7818012999071495 



In [131]:
clf20.fit(X_train, y_train)
clf20.best_params_
clf20.refit

preds = clf20.predict(X_test)
probs = clf20.predict_proba(X_test)

print("clf20: \n", classification_report(y_test, preds))

clf20: 
               precision    recall  f1-score   support

         EAP       0.78      0.78      0.78      2587
         HPL       0.77      0.80      0.78      1852
         MWS       0.79      0.77      0.78      2023

    accuracy                           0.78      6462
   macro avg       0.78      0.78      0.78      6462
weighted avg       0.78      0.78      0.78      6462



In [132]:
print("20-fold cross validation:", np.mean(preds == y_test), "\n")

20-fold cross validation: 0.7816465490560198 



## 1b. Feature Importance: printing out the 10 most important and least important features for each class, EAP, HPL, and MWS

In [42]:
# getting the logistic regression model out of the pipeline,
# we use "model dot x" to access attributes of the logistic regression class in sklearn, for example, one we will be 
# using is model.coef_ to get the learned wieghts
model = pipeline.named_steps['classifier']

# getting the features out of the pipeline which is a feature union using named_steps for the same reasons as above
features = pipeline.named_steps['features']

print(model.classes_, "\n")

# example of how to get the learned weights of the features, since we have 3 classes 
# model.coef_[0] corresponds to the weights for class EAP
# model.coef_[1] corresponds to the weights for class HPL ect..
EAP_coef = model.coef_[0]

# feature union has an attribute  transformer_list where we use named_steps to access
# tfidf. TDIDF comes from the text pipeline we created earlier, it was the first pipeline we created
# From tdif we can get the names of the feautres in this case it is the words 
feat_names = features.transformer_list[0][1].named_steps['tfidf'].get_feature_names_out()

# Create dictionaries to store the top features for each class
top_features = {class_label: [] for class_label in model.classes_}
bottom_features = {class_label: [] for class_label in model.classes_}

# Loop through each class
for class_label, class_coef in zip(model.classes_, model.coef_):
    feature_coef_pairs = list(zip(feat_names, class_coef)) # pair together the word with its weight 
    feature_coef_pairs.sort(key=lambda x: x[1], reverse=True)

    # store the top 10 features for the current class
    top_features[class_label] = feature_coef_pairs[:10]

    # store bottom ten features for the class by first reversing the order then taking the top ten 
    feature_coef_pairs.sort(key=lambda x: x[1], reverse=False)
    bottom_features[class_label] = feature_coef_pairs[:10]

# Print the top 10 features for each class
for class_label, top_feats in top_features.items():
    print(f"Top 10 Features for Class {class_label}:")
    for i, (feature_name, coefficient) in enumerate(top_feats, 1):
        print(f"{i}. Feature: {feature_name}, Weight: {coefficient}")
    print("\n")

# Print the bottom 10 features for each class
for class_label, bottom_feats in bottom_features.items():
    print(f"Bottom 10 Features for Class {class_label}:")
    for i, (feature_name, coefficient) in enumerate(bottom_feats, 1):
        print(f"{i}. Feature: {feature_name}, Weight: {coefficient}")
    print("\n")
    


['EAP' 'HPL' 'MWS'] 

Top 10 Features for Class EAP:
1. Feature: mr, Weight: 2.2072667836715794
2. Feature: madame, Weight: 2.1814090246927336
3. Feature: gentleman, Weight: 2.0128172870292076
4. Feature: balloon, Weight: 1.8273713598905108
5. Feature: minutes, Weight: 1.8248577255162723
6. Feature: lady, Weight: 1.8125028671224843
7. Feature: altogether, Weight: 1.7457028808238926
8. Feature: dupin, Weight: 1.743151411316225
9. Feature: matter, Weight: 1.7083742939592135
10. Feature: character, Weight: 1.6302574776783456


Top 10 Features for Class HPL:
1. Feature: west, Weight: 2.5471953018543907
2. Feature: street, Weight: 2.3649235531347683
3. Feature: later, Weight: 2.3412122546684406
4. Feature: gilman, Weight: 2.270183461978333
5. Feature: innsmouth, Weight: 2.0153998008812235
6. Feature: men, Weight: 1.9948854499674662
7. Feature: despite, Weight: 1.940749033477425
8. Feature: ancient, Weight: 1.8741202240029613
9. Feature: outside, Weight: 1.8584574038628214
10. Feature: jermy

### 1c. Error Analysis

In [41]:
incorrectly_predicted_indices = np.where(preds != y_test)[0]
print("---Incorrect Predicitons---")
for i in range (10):
    print(i+1 , " predicted: \t", preds[incorrectly_predicted_indices[i]], "\t ground truth: ", y_test[incorrectly_predicted_indices[i]])
    print("feature vector:\n",  X_test.iloc[i])
    print("\n")


---Incorrect Predicitons---
1  predicted: 	 HPL 	 ground truth:  MWS
feature vector:
 processed             the gigantic magnitude and the immediately ava...
length                                                              124
words                                                                20
words_not_stopword                                                   11
tagged_words          [(the, DT), (gigantic, JJ), (magnitude, NN), (...
adjective_count                                                       2
noun_count                                                            4
verb_count                                                            3
avg_word_length                                                7.181818
commas                                                                1
Name: id15695, dtype: object


2  predicted: 	 EAP 	 ground truth:  MWS
feature vector:
 processed             shall i disturb this calm by mingling in the w...
length                           

In [114]:
submission = pd.read_csv('./HA9-data/test.csv')

#preprocessing
submission = processing(submission)
predictions = clf.predict_proba(submission)

preds = pd.DataFrame(data=predictions, columns = clf.best_estimator_.named_steps['classifier'].classes_)

#generating a submission file
result = pd.concat([submission[['id']], preds], axis=1)
result.set_index('id', inplace = True)
result.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.199965,0.063969,0.736066
id24541,0.795038,0.048417,0.156545
id00134,0.190049,0.748933,0.061018
id27757,0.64843,0.233289,0.11828
id04081,0.620507,0.274016,0.105477


### Model Performance Observations:
#### 1. After the addition of the three features the models performance went up quite significantly. The proportion of correctly classified text from the random forest version with the initial features was 0.671. After adding the three new features it went up to 0.77. I believe this is because the addition of adjective, verb and noun count better potrays the styles of each author and so it helps when classifying the text.
#### 2. The features we were initially using, length, words, commas, are not very useful when trying to distinguish bewteen each authors writing style. Theses are all features that don't show much uniqueness in how the three authors write. Whereas the amount of verbs, adjectives and nouns tell more about the style of the writer and therfore helps the model predict the pieces of text better.
### 3. Lastly, the change from using a random forest classifier to a logistic regression classifier might have helped the precision be a little better since from what I understand about random forest classifiers is that since they are more complex they have the potential to overfit when dealing with smaller datasets. Whereas logistic regression is a simpler model and is less prone to overfitting perhaps. 