In [1]:
import json
import pandas as pd
import numpy as np
import os

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import gensim.downloader

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize,RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### Settings

In [3]:
# Set directories of main dataset and metadata
directory_main_train = 'gdrive/My Drive/Colab Notebooks/ddp/binary/binary_train.csv'
directory_main_val = 'gdrive/My Drive/Colab Notebooks/ddp/binary/binary_val.csv'
directory_main_full = 'gdrive/My Drive/Colab Notebooks/ddp/binary/binary_full.csv'

# Should the model be saved?
save_model = False
model_name = "test"

### Load the data, filter on English podcasts and insert into dataframe

In [4]:
# Function that removes punctuation, lowercases everything (to normalize), tokenizes, and converts the labels to int
def clean_data(df):
    tokenizer = RegexpTokenizer(r'\w+')
    df['text'] = df['text'].str.lower()
    df['text_tokenized'] = df['text'].apply(tokenizer.tokenize)
    return df

In [5]:
df_train = pd.read_csv(directory_main_train,sep='\t')

# Normalize and clean text
df_train = clean_data(df_train)
text = df_train['text_tokenized'].values

optimized_terms = Phraser(Phrases(text, min_count=2))
text_final = optimized_terms[text]

In [6]:
# Build a word2vec model using the vocabulary
modelw2v = Word2Vec(text_final,size=300)

modelw2v.build_vocab(text_final, update=True)
print("Vocab building done!")

modelw2v.train(text_final, total_examples=modelw2v.corpus_count, epochs=30)
print("Training done!")

if save_model:
    model_format = model_name + ".model"

    # Save the current model for use later
    modelw2v.save(model_format)

    # Load the model to use now
    word_vectors = Word2Vec.load(model_format).wv
else:
    word_vectors = modelw2v.wv


# Initiate the K-means algorithm and find n clusters
model = KMeans(n_clusters=2, max_iter=10000, random_state=True, n_init=1000).fit(X=word_vectors.vectors.astype('double'))
print('KMeans model ready!')

Vocab building done!
Training done!
KMeans model ready!


In [11]:
print(word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None))
print(word_vectors.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None))

[('bethenny', 0.6718787550926208), ('guilty', 0.6651849150657654), ('uncomfortable', 0.6538792848587036), ('loyal', 0.6524480581283569), ('say_anything', 0.6517428755760193), ('focused', 0.6376308798789978), ('cowboy_hat', 0.6332029104232788), ('unbelievable', 0.6292005777359009), ('laugh', 0.6271635293960571), ('frank', 0.6247174143791199)]
[('combat', 0.8252910375595093), ('free_agency', 0.8121446371078491), ('team_rules', 0.8059823513031006), ('previous', 0.8020377159118652), ('creatures', 0.8013936281204224), ('rate', 0.7976237535476685), ('our_community', 0.7950810194015503), ('kicks', 0.7943021059036255), ('chompers', 0.793852686882019), ('uk', 0.793601930141449)]


In [12]:
# Set the cluster positions
positive_cluster_index = 0
positive_cluster_center = model.cluster_centers_[positive_cluster_index]
negative_cluster_center = model.cluster_centers_[1-positive_cluster_index]

In [13]:
def create_df_dict():

    # Create vectors for each word
    metrics_df = pd.DataFrame(word_vectors.vocab.keys())
    metrics_df.columns = ['words']

    # Assign words to a cluster using Sklearn's predict
    metrics_df['vectors'] = metrics_df['words'].apply(lambda x: word_vectors[f'{x}'])
    metrics_df['cluster'] = metrics_df['vectors'].apply(lambda x: model.predict([np.array(x)]))

    # Unpack the values from list
    metrics_df['cluster'] = metrics_df['cluster'].apply(lambda x: x[0])

    # Assign words to cluster
    metrics_df['cluster_value'] = [1 if i==positive_cluster_index else -1 for i in metrics_df['cluster']]

    # Assign the inverse distance to the closest cluster to each word
    metrics_df['distance'] = metrics_df.apply(lambda x: 1/(model.transform([x['vectors']]).min()), axis=1)

    # Calculate the sentiment coefficient
    metrics_df['sentiment_coeff'] = metrics_df['distance'] * metrics_df['cluster_value']

    sentiment_dict = dict(zip(metrics_df['words'].values, metrics_df['sentiment_coeff'].values))

    return metrics_df, sentiment_dict

### ---Tf-idf weighting---

In [14]:
# Load in the validation set and clean like the training set
df_val = pd.read_csv(directory_main_val,sep='\t')
df_val = clean_data(df_val)

In [15]:
# Vectorize the sequences
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(df_val['text'])

# Get the names of each feature
features = pd.Series(tfidf.get_feature_names())

# Transform the text into their respective TF-IDF values
transformed = tfidf.transform(df_val['text'])



In [18]:
metrics_df, sentiment_dict = create_df_dict()

# Create a dictionary of every word and its corresponding TF-IDF value
def create_tfidf_dictionary(x, transformed_file, features):
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    return dict(zip(vector_coo.col, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x['text'].split()))

# Replaces a word with its respective sentiment value
def replace_sentiment_words(word, sentiment_dict):
    try:
        return sentiment_dict[word]
    except KeyError:
        return 0

replaced_tfidf_scores = df_val.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)
replaced_closeness_scores = df_val['text'].apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

# Create new dataframe for final calculations
df_kmeans = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, df_val['text'], df_val['sentiment_score']]).T
df_kmeans.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment_score']

# Take the dot product to determine if a segment is mostly positive or mostly negative
df_kmeans['prediction'] = df_kmeans.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)

# Predict the label and convert to the same datatype
df_kmeans['prediction'] = (df_kmeans['prediction']>=0).astype('int8')
df_kmeans['sentiment_score'] = df_kmeans['sentiment_score'].astype('int8')

### ---Performance Metrics---

In [None]:
y_true_kmeans = df_kmeans['sentiment_score']
y_pred_kmeans = df_kmeans['prediction']

# Display the final scores
print('Confusion Matrix\n',confusion_matrix(y_true_kmeans,y_pred_kmeans))
print(classification_report(y_true_kmeans, y_pred_kmeans))

Confusion Matrix
 [[172 233]
 [447 440]]
              precision    recall  f1-score   support

           0       0.28      0.42      0.34       405
           1       0.65      0.50      0.56       887

    accuracy                           0.47      1292
   macro avg       0.47      0.46      0.45      1292
weighted avg       0.54      0.47      0.49      1292

