In [1]:
import warnings
warnings.filterwarnings("ignore")
raw_txt = []
filename = "user1.txt"
# Opening the file and reading its contents
with open(filename, 'r', encoding='utf-8') as file:
    # Read the contents of the file into a string
    file_contents = file.read()
    # Append the string to the list
    raw_txt.append(file_contents)
# Now, raw_txt contains the contents of "user1.txt" as a string

In [2]:
print(raw_txt)

["This is my first day writing this personal diary of mine. I plan to share all my secrets with you (the diary). I hope to carry out deep conversations with you. I'm rooting for you to give me valuable advices along the way."]


In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

# Downloading NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Initialize stemming and lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Initialize stopwords
stop_words = set(stopwords.words('english'))

# Preprocessing steps
clean_txt = []
for text in raw_txt:
    # Lowercasing
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Removing Punctuation, Stopwords, and Numbers
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    # Stemming and Lemmatization
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Removing Special Characters
    cleaned_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in tokens]
    # Removing Extra Whitespace
    cleaned_tokens = [token.strip() for token in cleaned_tokens if token.strip()]
    # Append preprocessed text to the result
    clean_txt.append(cleaned_tokens)
# Now, clean_txt contains the preprocessed text data
print(clean_txt)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


[['first', 'day', 'writing', 'personal', 'diary', 'mine', 'plan', 'share', 'secrets', 'diary', 'hope', 'carry', 'deep', 'conversations', 'rooting', 'give', 'valuable', 'advices', 'along', 'way']]


In [5]:
from gensim.models import Word2Vec

# Initialize and train Word2Vec model
word2vec_model = Word2Vec(sentences=clean_txt, vector_size=100, window=5, min_count=1, sg=0)
# Get the word vector for a specific word (e.g., 'writing')
word_vector = word2vec_model.wv['writing']
# Print the word vector
print("Word vector for 'writing':", word_vector)

Word vector for 'writing': [-0.00714456  0.00124695 -0.00717579 -0.00224254  0.0037249   0.00583441
  0.00120493  0.00210188 -0.00411531  0.00722782 -0.00630779  0.00464432
 -0.00821532  0.00203472 -0.0049721  -0.00425047 -0.00311208  0.00565532
  0.00579809 -0.00497918  0.00077567 -0.00849458  0.00781444  0.00925513
 -0.00274215  0.00080066  0.00074497  0.00547326 -0.00860741  0.00058077
  0.00687346  0.00223033  0.0011257  -0.00932056  0.00848539 -0.00626156
 -0.0029961   0.00349954 -0.00076982  0.00140666  0.00178326 -0.0068309
 -0.00972542  0.00904331  0.00619928 -0.00691643  0.00340358  0.00020723
  0.00475327 -0.00712293  0.00402952  0.00435081  0.00996157 -0.00447783
 -0.00138544 -0.00732011 -0.00969798 -0.00908144 -0.00102043 -0.00650703
  0.00484654 -0.00616875  0.0025223   0.00073932 -0.0033957  -0.00097949
  0.00997966  0.00915139 -0.00446533  0.00908463 -0.00564671  0.00593114
 -0.00309437  0.00343803  0.00301617  0.00690075 -0.00237569  0.00877494
  0.00758841 -0.0095488  