#  Word2Vec - Feature Generation Method 2

In [2]:
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
import plotly.express as px
import pandas as pd
from gensim.models import Word2Vec
import umap
import numpy as np
import string
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
%matplotlib inline
from sklearn.manifold import TSNE

## Dataset cleaning

•	We set a threshold of minimum 30 articles per category to be used as our dataset

•	Since, this is a text classification problem, we did further cleaning and preprocessing such as:
1.	HTML Character Replacement
2.	Removing Escape Sequences, such as newline or tab, and replacing it with a single space
3.	Removing punctuation and numbers as they do not provide any useful information
4.	removing commonly used words like 'the', 'and', 'in', etc, 
5.	and finally converts all words to lowercase

•	All this is done using Python built-in libraries and functions, and the Natural Language Toolkit 

•	Once the data is cleaned, it is then added back as a new column labeled “New content”

In [4]:
stop_words = set(stopwords.words('english'))

# Example usage
documents = reuters.fileids()


In [5]:
#this creates a dataframe that splits it into 'category', 'subject' and 'content'
data = []

# Loop over each news article in the Reuters corpus
for article_id in documents:
    # Get the categories, title, and text of the article
    categories = reuters.categories(article_id)
    subject, body = reuters.raw(article_id).split('\n', maxsplit=1)
    if len(categories)>1:
        continue
    # Add a new row for each category
    for category in categories:
        # Store the data in a dictionary
        data.append({'category': category, 'subject': subject, 'content': body})

# Create a DataFrame from the data
reuters_df = pd.DataFrame(data)

# Show a few rows of the DataFrame
reuters_df

Unnamed: 0,category,subject,content
0,trade,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT,Mounting trade friction between the\n U.S. ...
1,grain,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS,A survey of 19 provinces and seven cities\n ...
2,ship,AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS...,"Tug crews in New South Wales (NSW),\n Victo..."
3,gold,WESTERN MINING TO OPEN NEW GOLD MINE IN AUSTRALIA,Western Mining Corp Holdings Ltd\n &lt;WMNG...
4,acq,SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER,Sumitomo Bank Ltd &lt;SUMI.T> is certain to\...
...,...,...,...
9155,interest,FED SETS TWO BILLION DLR CUSTOMER REPURCHASE,The Federal Reserve entered the U.S.\n gove...
9156,earn,KNIGHT-RIDDER INC &lt;KRN> SETS QUARTERLY,Qtly div 25 cts vs 25 cts prior\n Pay A...
9157,earn,TECHNITROL INC &lt;TNL> SETS QUARTERLY,Qtly div 12 cts vs 12 cts prior\n Pay A...
9158,earn,NATIONWIDE CELLULAR SERVICE INC &lt;NCEL> 4TH QTR,Shr loss six cts vs loss 18 cts\n Net l...


In [5]:
def clean_text(text):    
    # Define a dictionary of html symbols and their replacements
    html_symbols = { '&lt;': '<', '&gt;': '>', '&amp;': '&', '&apos;': '\'', '&quot;': '\"' }
    # Replace html symbols with their corresponding characters
    for symbol, char in html_symbols.items():
        text = text.replace(symbol, char)
    
    # Remove escape sequences from the text
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')

    # Replace hyphens with spaces to preserve words
    text = text.replace('-', ' ')

    # Remove punctuation, numbers, and decimal places from the text
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join(char for char in text if not char.isdigit() and char != '.')

    # split the text into words
    words = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # convert all text to lowercase
    words = [word.lower() for word in words]
    
    # join the words back into a string
    return ' '.join(words)

# apply the function to the 'text' column of the dataframe
reuters_df['content'] = reuters_df['content'].apply(clean_text)
reuters_df['subject'] = reuters_df['subject'].apply(clean_text)
reuters_df

Unnamed: 0,category,subject,content
0,trade,asian exporters fear damage from us japan rift,mounting trade friction us and japan raised fe...
1,grain,china daily says vermin eat pct grain stocks,a survey provinces seven cities showed vermin ...
2,ship,australian foreign ship ban ends but nsw ports...,tug crews new south wales nsw victoria western...
3,gold,western mining to open new gold mine in australia,western mining corp holdings ltd wmngs wmc sai...
4,acq,sumitomo bank aims at quick recovery from merger,sumitomo bank ltd sumit certain lose status ja...
...,...,...,...
9155,interest,fed sets two billion dlr customer repurchase,the federal reserve entered us government secu...
9156,earn,knight ridder inc krn sets quarterly,qtly div cts vs cts prior pay april record apr...
9157,earn,technitrol inc tnl sets quarterly,qtly div cts vs cts prior pay april record apr...
9158,earn,nationwide cellular service inc ncel th qtr,shr loss six cts vs loss cts net loss vs loss ...


In [6]:
#set minimum threshold for each category
threshold = 30

list_filteredCategory = reuters_df["category"].value_counts()[reuters_df["category"].value_counts()>threshold].index.tolist()
print(f"Number of categories with counts larger than {threshold} : {len(list_filteredCategory)}")
print(reuters_df["category"].value_counts()[reuters_df["category"].value_counts()>threshold])
# group the dataframe by the 'category' column and filter out the groups with counts less than the threshold
reuters_df = reuters_df.groupby('category').filter(lambda x: len(x) >= threshold)
reuters_df = reuters_df.reset_index(drop=True)
reuters_df

Number of categories with counts larger than 30 : 24
category
earn            3923
acq             2292
crude            374
trade            326
money-fx         309
interest         272
money-supply     151
ship             144
sugar            122
coffee           112
gold              90
gnp               74
cpi               71
cocoa             61
grain             51
alum              50
reserves          49
jobs              49
ipi               45
copper            44
rubber            40
iron-steel        38
nat-gas           36
bop               31
Name: count, dtype: int64


Unnamed: 0,category,subject,content
0,trade,asian exporters fear damage from us japan rift,mounting trade friction us and japan raised fe...
1,grain,china daily says vermin eat pct grain stocks,a survey provinces seven cities showed vermin ...
2,ship,australian foreign ship ban ends but nsw ports...,tug crews new south wales nsw victoria western...
3,gold,western mining to open new gold mine in australia,western mining corp holdings ltd wmngs wmc sai...
4,acq,sumitomo bank aims at quick recovery from merger,sumitomo bank ltd sumit certain lose status ja...
...,...,...,...
8779,interest,fed sets two billion dlr customer repurchase,the federal reserve entered us government secu...
8780,earn,knight ridder inc krn sets quarterly,qtly div cts vs cts prior pay april record apr...
8781,earn,technitrol inc tnl sets quarterly,qtly div cts vs cts prior pay april record apr...
8782,earn,nationwide cellular service inc ncel th qtr,shr loss six cts vs loss cts net loss vs loss ...


## Data preprocessing

The below code performs several steps to preprocess the Reuters dataset and then use the Word2Vec model to generate word embeddings, which are then used to convert sentences into vectors. The steps include:

1. **Sentence Splitting**: We split each content of the Reuters dataset into individual words, creating a list of words for each article.

2. **Word2Vec Model Training**: The Word2Vec model is trained on the sentences from the Reuters dataset. The parameters indicate that the context window size is 5 words, a word needs to appear at least 5 times to be considered in the model, and the model uses 4 worker threads to train faster.

3. **Sentence to Vector Conversion**: A function is defined to convert a sentence into a vector by averaging the Word2Vec vectors of its words. If a word is not in the Word2Vec model's vocabulary, it's ignored.

4. **Removing NA Values**: The Reuters dataframe is cleaned to remove any rows with NA values.

5. **Vector Conversion and Dimensionality Reduction**: The 'vector' column of the dataframe is converted into a numpy array, and t-SNE (t-Distributed Stochastic Neighbor Embedding) is used to reduce the dimensionality of these vectors to 3 components, making them easier to visualize and work with.

Through the below code, the Reuters news articles are transformed from text data into numerical vectors that can be used as input for machine learning models.

In [7]:
sentences = [sentence.split() for sentence in reuters_df['content']]
model = Word2Vec(sentences, window=5, min_count=5, workers=4)

In [8]:
# Define a function to convert a sentence to a vector
def sentence_to_vec(sentence):
    words = sentence.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if len(vectors) == 0:
        return None
    return sum(vectors) / len(vectors)

In [9]:
# Apply the sentence_to_vec function to each row in the DataFrame
reuters_df['vector'] = reuters_df['content'].apply(sentence_to_vec)

In [10]:
reuters_df["vector"][0]

array([ 4.96033626e-03,  1.88297749e-01,  1.86833233e-01,  9.61270109e-02,
        3.61985415e-02, -3.14277470e-01,  7.22549558e-02,  2.72226870e-01,
       -5.12059808e-01,  8.98034200e-02, -2.22922310e-01, -5.07533610e-01,
        5.66791147e-02,  4.32990119e-03, -2.92481571e-01, -1.41896801e-02,
        2.48342007e-01, -2.93785781e-01, -2.21010551e-01, -5.92373729e-01,
        9.81035084e-02,  6.37945160e-02,  2.28297636e-01, -1.81612536e-01,
       -1.06410585e-01, -9.86280963e-02, -4.66109335e-01, -5.06145895e-01,
       -4.08262402e-01,  1.20098874e-01,  3.47252697e-01,  1.71071678e-01,
        2.59000659e-01, -3.80822659e-01,  2.21723169e-01,  1.49693951e-01,
        2.05135465e-01, -3.08067888e-01, -2.89795816e-01, -5.84835351e-01,
       -1.54967979e-01, -3.41306865e-01, -2.08714902e-01, -1.08404160e-01,
       -1.92283630e-01, -1.51312575e-01, -2.58270979e-01,  1.68104693e-02,
       -3.32883783e-02,  2.97623277e-01, -2.78882772e-01, -3.94193411e-01,
       -2.71376818e-01, -

In [11]:
reuters_df = reuters_df.dropna()

vectors = np.array(reuters_df['vector'].tolist())
# Fit t-SNE model to the vectors
tsne = TSNE(n_components=3, random_state=0)
vectors_tsne = tsne.fit_transform(vectors)

## Feature visualisation through t-SNE

After generating word embeddings using Word2Vec, we visualise these high-dimensional vectors to gain insights into their structure.

In [12]:
# Generate a list of colors for the categories
color_list = px.colors.qualitative.Plotly

# Create plotly figure
fig = px.scatter_3d(reuters_df, x=vectors_tsne[:, 0], y=vectors_tsne[:, 1], z=vectors_tsne[:, 2], color='category', color_discrete_sequence=color_list)

# Set the size of the dots
dot_size = 2
fig.update_traces(marker=dict(size=dot_size, opacity=0.8))

# Set the title and axis labels
fig.update_layout(title='Word2Vec 3D Visualization', scene=dict(xaxis_title='Dim 1', yaxis_title='Dim 2', zaxis_title='Dim 3'))

# Show plot
fig.write_html("TNSE_Word2Vec.html")
fig.show()

## Alternative method (UMAP) to visualise the feature space

UMAP follows a similar concept to t-SNE for visualizing high-dimensional data in lower-dimensional space (usually 2D or 3D for visualization purposes).

In [12]:
# perform dimensionality reduction using UMAP
reducer = umap.UMAP(n_neighbors=10, min_dist=0.1, metric='euclidean', n_components=3)
X_umap = reducer.fit_transform(vectors)

In [13]:
# Generate a list of colors for the categories
color_list = px.colors.qualitative.Plotly

# Create plotly figure
fig = px.scatter_3d(reuters_df, x=X_umap[:, 0], y=X_umap[:, 1], z=X_umap[:, 2], color='category', color_discrete_sequence=color_list)

# Set the size of the dots
dot_size = 2
fig.update_traces(marker=dict(size=dot_size, opacity=0.8))

# Set the title and axis labels
fig.update_layout(title='UMAP 3D Visualization', scene=dict(xaxis_title='UMAP Dimension 1', yaxis_title='UMAP Dimension 2', zaxis_title='UMAP Dimension 3'))

# Show plot
fig.show()

#### 