In [1]:
import pandas as pd
import numpy as np
import time
import csv
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FunctionTransformer, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from gensim.models import KeyedVectors
from scipy.sparse import hstack
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.optimizers import Adam

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/james/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Hyper-parameters consumed by the Keras text-CNN built in the following cell.
vocab_size = 20000     # Embedding input_dim
embedding_dim = 100    # Embedding output_dim
max_length = 100       # Embedding input_length
num_filters = 128      # Conv1D filters
kernel_size = 5        # Conv1D kernel_size
# The classification reports recorded in this notebook list four labels (1-4),
# so the softmax head needs four units rather than two.
# NOTE(review): the labels are 1-based; they would presumably need remapping to
# 0-3 before training a softmax classifier on them - confirm.
num_classes = 4

In [None]:
# Text CNN assembled layer by layer:
# embedding -> 1-D convolution -> global max pooling -> small dense head.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
# Softmax output with one unit per class.
model.add(Dense(num_classes, activation='softmax'))

In [2]:
# Load word vectors previously saved with gensim; mmap='r' requests read-only
# memory-mapping of the stored arrays instead of reading them fully into RAM.
# The path is relative to the notebook's working directory.
# NOTE(review): judging by the file name these are presumably the 300-d
# Google News word2vec vectors - confirm.
wordVec = KeyedVectors.load("word2vec-google-news-300.model", mmap='r')

In [3]:
def textLen(text):
    """Count the whitespace-separated tokens of every document.

    Args:
        text: iterable of strings (one document per item).

    Returns:
        numpy array of shape (n_documents, 1) holding the token count of
        each document, in input order.
    """
    token_counts = [len(doc.split()) for doc in text]
    # Column vector so the result can be stacked next to other feature blocks.
    return np.array(token_counts).reshape(-1, 1)

In [4]:
def meanEmbed(text, wordVec):
    """Average the word vectors of each document.

    Args:
        text: iterable of strings (one document per item).
        wordVec: mapping-like object supporting ``word in wordVec``,
            ``wordVec[word]`` and a ``vector_size`` attribute.

    Returns:
        numpy array with one row per document: the element-wise mean of the
        vectors of all whitespace-separated tokens found in ``wordVec``.
        Documents with no known token get an all-zero row.
    """
    doc_vectors = []

    for doc in text:
        known_vectors = [wordVec[token] for token in doc.split() if token in wordVec]

        if len(known_vectors) == 0:
            # No token is in the vocabulary: fall back to a single zero vector.
            known_vectors = [np.zeros(wordVec.vector_size)]

        doc_vectors.append(np.mean(known_vectors, axis=0))

    return np.array(doc_vectors)

In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import FunctionTransformer

def vader_sentiment_features(texts):
    """Turn each text into its four VADER polarity scores.

    Args:
        texts: iterable of strings.

    Returns:
        numpy array with one row per text and the columns
        (neg, neu, pos, compound), taken from
        ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    analyzer = SentimentIntensityAnalyzer()
    score_keys = ('neg', 'neu', 'pos', 'compound')

    rows = []
    for doc in texts:
        polarity = analyzer.polarity_scores(doc)
        rows.append([polarity[key] for key in score_keys])

    return np.array(rows)


In [17]:
# Fetch the NLTK resources used by the tokenizer, POS tagger and lemmatizer below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# SentimentIntensityAnalyzer loads the VADER lexicon when it is constructed;
# without this download a clean environment raises LookupError.
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /Users/james/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/james/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/james/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# WORDNET LEMMATIZER (with appropriate pos tags)
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

# Single module-level lemmatizer instance, used by lemmatize_with_pos below.
lemmatizer = WordNetLemmatizer()

# Define function to lemmatize each word with its POS tag

# POS_TAGGER_FUNCTION : TYPE 1
def pos_tagger(nltk_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Args:
        nltk_tag: tag string such as those returned by ``nltk.pos_tag``.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        when the tag starts with 'J', 'V', 'N' or 'R' respectively;
        ``None`` for every other tag.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Resolve the constant lazily, only for the branch that matches.
            return getattr(wordnet, attr_name)
    return None

def lemmatize_with_pos(tokens):
    """Lemmatize a token list, using each token's POS tag when one is available.

    Args:
        tokens: list of token strings.

    Returns:
        list of strings, same length and order as ``tokens``. Tokens whose tag
        maps to a WordNet POS (see ``pos_tagger``) are lemmatized with it;
        the others are returned unchanged.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        wordnet_pos = pos_tagger(tag)
        if wordnet_pos is not None:
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # No usable WordNet tag: keep the surface form.
            lemmas.append(token)
    return lemmas

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/james/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [19]:
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import FunctionTransformer

def nltk_preprocess(texts):
    """Tokenize, filter and lemmatize every document.

    Args:
        texts: iterable of strings.

    Returns:
        list of strings, one per input document: the purely alphabetic tokens
        of the document, lemmatized with ``lemmatize_with_pos`` and re-joined
        with single spaces.
    """
    def _clean(document):
        # Keep only tokens made entirely of letters (drops numbers/punctuation).
        alpha_tokens = [tok for tok in word_tokenize(document) if tok.isalpha()]
        return ' '.join(lemmatize_with_pos(alpha_tokens))

    return [_clean(document) for document in texts]

# Wrap the preprocessing function in a FunctionTransformer so it can be used
# as a Pipeline step; this one instance is the first step of both pipeline3
# and pipeline4.
nltk_preprocess_transformer = FunctionTransformer(nltk_preprocess)


In [32]:
# Baseline: TF-IDF over unigrams and bigrams (at most 10000 terms, ignoring
# terms in more than 90% of documents or in fewer than 2) -> LogisticRegression.
# The length / word-vector / VADER branches are commented out, so the
# FeatureUnion currently contains only the TF-IDF branch.
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=2, max_features=10000)),
            # ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)), 
        ])),
        # ('length', FunctionTransformer(textLen)),
        # ('wordvec', FunctionTransformer(meanEmbed, kw_args={'wordVec': wordVec})),
        # ('vader_sentiment', FunctionTransformer(vader_sentiment_features)),
    ])),
    ('clf', LogisticRegression(max_iter=2000))
])

In [15]:
# Unigram TF-IDF (at most 10000 terms) combined with three hand-built feature
# blocks - token count, mean word vector and VADER scores - feeding a
# LogisticRegression.
# NOTE(review): the token-count column is unscaled next to TF-IDF values; this
# may be related to the convergence warning recorded for this pipeline - confirm.
pipeline2 = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,1), max_df=0.9, min_df=2, max_features=10000)),
            # ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)), 
        ])),
        ('length', FunctionTransformer(textLen)),
        ('wordvec', FunctionTransformer(meanEmbed, kw_args={'wordVec': wordVec})),
        ('vader_sentiment', FunctionTransformer(vader_sentiment_features)),
    ])),
    ('clf', LogisticRegression(max_iter=2000))
])

In [29]:
# Same feature set as pipeline2, but every feature branch receives the output
# of nltk_preprocess (alphabetic tokens only, lemmatized) instead of raw text.
pipeline3 = Pipeline([
    ('nltk_preprocess', nltk_preprocess_transformer),  # Preprocessing step
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,1), max_df=0.9, min_df=2, max_features=10000)),
        ])),
        ('length', FunctionTransformer(textLen)),
        ('wordvec', FunctionTransformer(meanEmbed, kw_args={'wordVec': wordVec})),
        ('vader_sentiment', FunctionTransformer(vader_sentiment_features)),
    ])),
    ('clf', LogisticRegression(max_iter=2000))
])

In [28]:
# Preprocessed text -> unigram TF-IDF restricted to 30 terms plus the mean
# word vector -> LogisticRegression. The length and VADER branches are
# disabled, and max_iter is 1000 here (the other pipelines use 2000).
pipeline4 = Pipeline([
    ('nltk_preprocess', nltk_preprocess_transformer),  # Preprocessing step
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,1), max_df=0.9, min_df=2, max_features=30)),
        ])),
        # ('length', FunctionTransformer(textLen)),
        ('wordvec', FunctionTransformer(meanEmbed, kw_args={'wordVec': wordVec})),
        # ('vader_sentiment', FunctionTransformer(vader_sentiment_features)),  # Assuming you've defined this earlier
    ])),
    ('clf', LogisticRegression(max_iter=1000))
])

In [13]:
def train_model(model, X_train, y_train):
    ''' Fit `model` on the training data and return it.

    Args:
        model: any object exposing `fit(X, y)`.
        X_train: training inputs, passed to `fit` unchanged.
        y_train: training labels, passed to `fit` unchanged.

    Returns:
        The same `model` object, fitted in place. Existing callers that
        ignore the return value are unaffected.
    '''
    model.fit(X_train, y_train)
    return model

def predict(model, X_test):
    ''' Return whatever `model.predict` produces for `X_test`. '''
    predictions = model.predict(X_test)
    return predictions

In [11]:
#textlen + nltk vader + word2vec + tfidf 10000 features
# NOTE(review): as written in this notebook, `pipeline` only has the TF-IDF
# branch enabled; the header above presumably describes an earlier
# configuration of it - confirm.

# Load the training data (headerless CSV: label column, then text column).
# NOTE(review): this raises the stdlib csv module's field size limit; confirm
# it is needed for the parser pandas uses here.
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
X_train = train['text']
y_train = train['class']
model = pipeline

train_model(model, X_train, y_train)

# generate prediction on test data
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])
X_test = test['text']
y_test = test['class']

y_pred = predict(model, X_test)

# Use f1-macro as the metric.
# The label below says "validation", but the score is computed on balancedtest.csv.
score = f1_score(y_test, y_pred, average='macro')
print('score on validation = {}'.format(score))

print('classification report on test data')
# Reuse y_pred instead of pushing X_test through the whole pipeline a second time.
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


score on validation = 0.7240641520691288
classification report on test data
              precision    recall  f1-score   support

           1       0.85      0.88      0.87       750
           2       0.82      0.43      0.56       750
           3       0.58      0.71      0.64       750
           4       0.76      0.91      0.83       750

    accuracy                           0.73      3000
   macro avg       0.75      0.73      0.72      3000
weighted avg       0.75      0.73      0.72      3000



In [16]:
#3 features + tfidf 30 features
# NOTE(review): pipeline2 as written in this notebook uses max_features=10000
# for TF-IDF, not 30 - confirm which configuration this header refers to.

# Load the training data (headerless CSV: label column, then text column).
# NOTE(review): this raises the stdlib csv module's field size limit; confirm
# it is needed for the parser pandas uses here.
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
X_train = train['text']
y_train = train['class']
model = pipeline2

train_model(model, X_train, y_train)

# generate prediction on test data
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])
X_test = test['text']
y_test = test['class']

y_pred = predict(model, X_test)

# Use f1-macro as the metric.
# The label below says "validation", but the score is computed on balancedtest.csv.
score = f1_score(y_test, y_pred, average='macro')
print('score on validation = {}'.format(score))

print('classification report on test data')
# Reuse y_pred instead of pushing X_test through the whole pipeline a second time
# (the word-vector and VADER branches make a second pass costly).
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


score on validation = 0.6607373671495416
classification report on test data
              precision    recall  f1-score   support

           1       0.77      0.81      0.79       750
           2       0.67      0.29      0.40       750
           3       0.54      0.82      0.65       750
           4       0.80      0.79      0.80       750

    accuracy                           0.68      3000
   macro avg       0.69      0.68      0.66      3000
weighted avg       0.69      0.68      0.66      3000



In [14]:
#tfidf + LR

# Load the training data (headerless CSV: label column, then text column).
# NOTE(review): this raises the stdlib csv module's field size limit; confirm
# it is needed for the parser pandas uses here.
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
X_train = train['text']
y_train = train['class']
model = pipeline

train_model(model, X_train, y_train)

# generate prediction on test data
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])
X_test = test['text']
y_test = test['class']

y_pred = predict(model, X_test)

# Use f1-macro as the metric.
# The label below says "validation", but the score is computed on balancedtest.csv.
score = f1_score(y_test, y_pred, average='macro')
print('score on validation = {}'.format(score))

print('classification report on test data')
# Reuse y_pred instead of pushing X_test through the whole pipeline a second time.
print(classification_report(y_test, y_pred))

score on validation = 0.7211785285833309
classification report on test data
              precision    recall  f1-score   support

           1       0.85      0.80      0.82       750
           2       0.83      0.39      0.53       750
           3       0.57      0.82      0.67       750
           4       0.81      0.92      0.86       750

    accuracy                           0.73      3000
   macro avg       0.76      0.73      0.72      3000
weighted avg       0.76      0.73      0.72      3000



In [23]:
#3 features + POS tagging 

# Load the training data (headerless CSV: label column, then text column).
# NOTE(review): this raises the stdlib csv module's field size limit; confirm
# it is needed for the parser pandas uses here.
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
X_train = train['text']
y_train = train['class']
model = pipeline3

train_model(model, X_train, y_train)

# generate prediction on test data
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])
X_test = test['text']
y_test = test['class']

y_pred = predict(model, X_test)

# Use f1-macro as the metric.
# The label below says "validation", but the score is computed on balancedtest.csv.
score = f1_score(y_test, y_pred, average='macro')
print('score on validation = {}'.format(score))

print('classification report on test data')
# Reuse y_pred instead of pushing X_test through the whole pipeline a second time
# (tokenizing, tagging and lemmatizing the test set again is costly).
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


score on validation = 0.7169600197223034
classification report on test data
              precision    recall  f1-score   support

           1       0.84      0.86      0.85       750
           2       0.81      0.40      0.54       750
           3       0.58      0.76      0.66       750
           4       0.76      0.90      0.83       750

    accuracy                           0.73      3000
   macro avg       0.75      0.73      0.72      3000
weighted avg       0.75      0.73      0.72      3000



In [30]:
#3 features + POS tagging 
# NOTE(review): this cell has the same code as the other pipeline3 evaluation
# cell in this notebook; presumably it was re-run after pipeline3 was
# redefined - confirm, and keep only one of the two.

# Load the training data (headerless CSV: label column, then text column).
# NOTE(review): this raises the stdlib csv module's field size limit; confirm
# it is needed for the parser pandas uses here.
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
X_train = train['text']
y_train = train['class']
model = pipeline3

train_model(model, X_train, y_train)

# generate prediction on test data
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])
X_test = test['text']
y_test = test['class']

y_pred = predict(model, X_test)

# Use f1-macro as the metric.
# The label below says "validation", but the score is computed on balancedtest.csv.
score = f1_score(y_test, y_pred, average='macro')
print('score on validation = {}'.format(score))

print('classification report on test data')
# Reuse y_pred instead of pushing X_test through the whole pipeline a second time
# (tokenizing, tagging and lemmatizing the test set again is costly).
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


score on validation = 0.7067468465946685
classification report on test data
              precision    recall  f1-score   support

           1       0.84      0.87      0.86       750
           2       0.80      0.41      0.55       750
           3       0.58      0.67      0.62       750
           4       0.72      0.92      0.81       750

    accuracy                           0.72      3000
   macro avg       0.73      0.72      0.71      3000
weighted avg       0.73      0.72      0.71      3000



In [31]:
# wordvec features + POS tagging + tfidf 30

# Load the training data (headerless CSV: label column, then text column).
# NOTE(review): this raises the stdlib csv module's field size limit; confirm
# it is needed for the parser pandas uses here.
csv.field_size_limit(999999)
train = pd.read_csv('raw_data/fulltrain.csv', header = None, names=['class','text'])
X_train = train['text']
y_train = train['class']
model = pipeline4

train_model(model, X_train, y_train)

# generate prediction on test data
test = pd.read_csv("raw_data/balancedtest.csv", header = None, names=['class','text'])
X_test = test['text']
y_test = test['class']

y_pred = predict(model, X_test)

# Use f1-macro as the metric.
# The label below says "validation", but the score is computed on balancedtest.csv.
score = f1_score(y_test, y_pred, average='macro')
print('score on validation = {}'.format(score))

print('classification report on test data')
# Reuse y_pred instead of pushing X_test through the whole pipeline a second time
# (tokenizing, tagging and lemmatizing the test set again is costly).
print(classification_report(y_test, y_pred))

score on validation = 0.6719208912060579
classification report on test data
              precision    recall  f1-score   support

           1       0.79      0.79      0.79       750
           2       0.74      0.32      0.45       750
           3       0.54      0.78      0.64       750
           4       0.77      0.87      0.81       750

    accuracy                           0.69      3000
   macro avg       0.71      0.69      0.67      3000
weighted avg       0.71      0.69      0.67      3000

