<a href="https://colab.research.google.com/github/yeoyujie/TagTech-Titans/blob/main/CMPSC448.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Data Preprocessing**

### Import all the necessary packages

In [3]:
# Import the necessary libraries

import pandas as pd
import gzip
import spacy

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

import numpy as np

import nltk
import matplotlib.pyplot as plt
import seaborn as sns


### Load the data file

In [4]:
with gzip.open('train.txt.gz', 'rt') as f:
    data = f.read()

lines = data.split('\n')

# Splitting each line into its components and removing lines that are empty
lines = [line.split() for line in lines if line]


df = pd.DataFrame(lines, columns=['Token', 'POS tag', 'Chunking tag'])


df = df.drop('Chunking tag', axis=1)


 #### Check for missing and duplicate values.





In [5]:
# Check for missing values
print(df.isnull().sum())

# Drop duplicates
df = df.drop_duplicates()

# If there are missing values in 'Token' or 'POS tag' columns, we drop those rows
df = df.dropna(subset=['Token', 'POS tag'])

Token      0
POS tag    0
dtype: int64


In [6]:
print(df)
df.to_csv('df.txt', index=False, sep='\t')

             Token POS tag
0       Confidence      NN
1               in      IN
2              the      DT
3            pound      NN
4               is     VBZ
...            ...     ...
211688       2,480      CD
211689        cots     NNS
211696       pints     NNS
211698      Type-O      JJ
211715    Huricane     NNP

[20939 rows x 2 columns]


## 2. Split the data and vectorisation

In [7]:
def feature_extraction(token, index, sequence):
    """Extract features for a given token."""
    nltk.download('averaged_perceptron_tagger')

    # If token is empty, return default features
    if not token:
        return {
            'word': '',
            'is_capitalized': False,
            'is_all_caps': False,
            'is_all_lower': False,
            'prefix-1': '',
            'prefix-2': '',
            'prefix-3': '',
            'suffix-1': '',
            'suffix-2': '',
            'suffix-3': '',
            'prev_word': '' if index == 0 else sequence[index - 1],
            'next_word': '' if index == len(sequence) - 1 else sequence[index + 1],
            'has_hyphen': False,
            'is_numeric': False,
            'pos_tag':''
        }

    # Otherwise, return the regular word-based features
    features = {
        'word': token,
        'is_capitalized': token[0].upper() == token[0],
        'is_all_caps': token.upper() == token,
        'is_all_lower': token.lower() == token,
        'prefix-1': token[0],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': '' if index == 0 else sequence[index - 1],
        'next_word': '' if index == len(sequence) - 1 else sequence[index + 1],
        'has_hyphen': '-' in token,
        'is_numeric': token.isdigit(),
        'pos_tag' : nltk.pos_tag([token])[0][1]
    }
    return features



sequence = df['Token'].tolist()


X = [feature_extraction(token, i, sequence) for i, token in enumerate(sequence)]

from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(X)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       d

In [8]:
nlp = spacy.load("en_core_web_sm")

# Function to extract syntactic features
def extract_syntactic_features(text):
    doc = nlp(text)
    # Extract relevant syntactic features
    # Example: Get the number of noun phrases
    noun_phrases_count = len(list(doc.noun_chunks))
    return noun_phrases_count

df['syntactic_features'] = df['Token'].apply(extract_syntactic_features)

In [9]:
# Convert the 'syntactic_features' column to a sparse matrix
syntactic_feature_matrix = df['syntactic_features'].values.reshape(-1, 1)

# Horizontally stack the syntactic features with your existing features (X_combined)
X_combined_with_syntactic = hstack((X, syntactic_feature_matrix))

In [10]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['POS tag'])

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_combined_with_syntactic, y, test_size=0.2, random_state=42)


# **3. Model Training**


## Bayesian Classifier

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_distributions = {
    'alpha': uniform(0.01, 1)
}

nb = MultinomialNB()

random_search = RandomizedSearchCV(nb, param_distributions, n_iter=100, cv=5, n_jobs=-1, random_state=42)

random_search.fit(X_train, y_train)

print("Best hyperparameters: ", random_search.best_params_)

y_val_pred = random_search.best_estimator_.predict(X_val)

print("Accuracy on validation data:", round(accuracy_score(y_val, y_val_pred) * 100, 2), "%")
print(classification_report(y_val, y_val_pred))



Best hyperparameters:  {'alpha': 0.5347564316322378}
Accuracy on validation data: 69.08 %
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         5
           9       0.97      0.98      0.98       378
          10       0.00      0.00      0.00         6
          12       0.00      0.00      0.00         4
          13       0.00      0.00      0.00        28
          14       0.72      0.47      0.57       572
          15       0.00      0.00      0.00        10
          16       0.00      0.00      0.00        11
          17       0.00      0.00      0.00         3
          18       0.54      0.87      0.67       761
          19       0.79      0.93      0.86       864
          20       0.00      0.00      0.00        26
          21       0.73      0.94      0.82       408
          24       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Logistic Regression

In [13]:
model = LogisticRegression(max_iter=1000)

# Define the hyperparameters to tune
param_grid = {
    'penalty': ['l1'],
    'C': [1],
    'solver': ['liblinear']
}

# Use GridSearchCV to tune hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)

model = grid_search.best_estimator_
y_val_pred = model.predict(X_val)

print("Accuracy on validation data:", round(accuracy_score(y_val, y_val_pred) * 100, 2), "%")
print(classification_report(y_val, y_val_pred))



Best parameters:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy on validation data: 74.79 %
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           8       1.00      1.00      1.00         5
           9       0.99      1.00      0.99       378
          10       1.00      1.00      1.00         6
          12       0.00      0.00      0.00         4
          13       0.76      0.79      0.77        28
          14       0.74      0.66      0.69       572
          15       0.36      0.40      0.38        10
          16       0.73      1.00      0.85        11
          17       1.00      1.00      1.00         3
          18       0.68      0.77      0.72       761
          19       0.84      0.92      0.87       864
          20       0.17      0.04      0.06        26
          21       0.84      0.88      0.86       408
          23       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Support Vector Machines

In [14]:
model = SVC(kernel='linear', probability=True, random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto']
}

# Use GridSearchCV to tune hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)

model = grid_search.best_estimator_
y_val_pred = model.predict(X_val)

print("Accuracy on validation data:", round(accuracy_score(y_val, y_val_pred) * 100, 2), "%")
print(classification_report(y_val, y_val_pred))




KeyboardInterrupt: ignored

## 4. Model Testing

### Imported unlabeled test file

In [15]:
df_unlabeled = pd.read_csv('unlabeled_test_test.txt', header=None, names=['Token'], sep='\t', error_bad_lines=False, quoting=3, skip_blank_lines=False)

print(df_unlabeled.head())

# Replace NaN values with an empty string
df_unlabeled['Token'].fillna("", inplace=True)


       Token
0  @paulwalk
1         It
2         's
3        the
4       view




  df_unlabeled = pd.read_csv('unlabeled_test_test.txt', header=None, names=['Token'], sep='\t', error_bad_lines=False, quoting=3, skip_blank_lines=False)


In [16]:
sequence_unlabeled = df_unlabeled['Token'].tolist()

X_unlabeled = [feature_extraction(token, i, sequence_unlabeled) for i, token in enumerate(sequence_unlabeled)]

X_unlabeled = vectorizer.transform(X_unlabeled)

df_unlabeled['syntactic_features'] = df_unlabeled['Token'].apply(extract_syntactic_features)

syntactic_feature_matrix_unlabeled = df_unlabeled['syntactic_features'].values.reshape(-1, 1)

X_combined_syn_unlabeled = hstack((X_unlabeled, syntactic_feature_matrix_unlabeled))

y_unlabeled_pred = random_search.best_estimator_.predict(X_combined_syn_unlabeled)

y_unlabeled_pred_tags = label_encoder.inverse_transform(y_unlabeled_pred)

# Create a DataFrame with the tokens and their predicted POS tags
df_test = pd.DataFrame({'Token': df_unlabeled['Token'], 'Predicted POS tag': y_unlabeled_pred_tags})

df_test.to_csv('TagTech Titans.test.txt', sep='\t', index=False, header=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       d