# Sentiment Analysis using - Word2Vec model

In [8]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec


In [9]:
# Function to load and preprocess the reviews dataset in chunks
def load_data(json_file_path, text_field, chunk_size=None, percentage_of_sentences=None):
    # Load the JSON data in chunks using pandas
    chunks = pd.read_json(json_file_path, lines=True, chunksize=chunk_size)

    # Initialize empty list to store sentences
    sentences = []

    # Iterate through chunks
    for chunk in chunks:
        # Extract the text field from the DataFrame
        chunk_text = chunk[text_field].values

        # Take only a given percentage of the current chunk if specified
        if percentage_of_sentences is not None:
            assert (percentage_of_sentences > 0 and percentage_of_sentences <= 100)
            
            len_chunk = int(percentage_of_sentences / 100 * len(chunk_text))
            chunk_text = chunk_text[:len_chunk]

        # Tokenize the sentences
        chunk_tokens = [text_to_word_sequence(_) for _ in chunk_text]
        print(chunk_text)
        # Append the tokenized sentences to the list
        sentences.extend(chunk_tokens)
        

    return sentences

In [10]:
# Specify the path to your JSON file and the field containing the review text
json_file_path = r'C:\Users\Administrator\Documents\GitHub\Datasets\yelp_academic_dataset_review.json'
text_field = 'text'
chunk_size = 1000000  # Adjust the chunk size based on your available memory

In [11]:
# Initialize Word2Vec model outside the loop
word2vec = Word2Vec(vector_size=60, min_count=10, window=10)

In [12]:
# Specify the number of epochs you want
epochs = 10

In [13]:
# Load and preprocess the data (20% of sentences)
for epoch in range(epochs):  # Specify the number of epochs you want
    print(f"Training Word2Vec - Epoch {epoch + 1}")
    X_train = load_data(json_file_path, text_field, chunk_size=chunk_size, percentage_of_sentences=20)
    
    # Update the Word2Vec model with new sentences
    word2vec.build_vocab(X_train, update=True)
    word2vec.train(X_train, total_examples=word2vec.corpus_count, epochs=word2vec.epochs)

Training Word2Vec - Epoch 1
["If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker."
 "I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycle. From the nice, clean space and amazing bikes, to the welcoming and motivating instructors, every class is a top notch work out.\n\nFor anyone who struggles to fit workouts in, the online scheduling system makes it easy to plan ahead (and there's no need to line up way in advanced like many gyms make you do).\n\nThere is no way I can write this revie

["So this place has the best OUTDOOR SEATING in tucson! \n\nThe place has a cute little bar with wooden beams inside. \nDrinks were very tasty and the bartenders were very professional and really knew there stuff (master mixologists).\n\nBEST MUSIC! really supports local artists! We went to see a friend play at this place and the group before him was very lively and and energetic and the soloist after our friend was also nice.\nThe second time i went, was on a saturday night and let me tell you IT WAS INCREDIBLE! 2 Hip hop DJ's were spinning great hits from back in the day and mixed it with some of the better top 40 music! I really enjoyed the crowed and the crowed seemed to enjoy the music. \n\nI have to admit I was a little disappointed with the late night taco menu... but in their defense it is bar food. their fish tacos were kinda blah and the sauces could be more authentic (home-made) it is Tucson after all.\n\nTHE BATHROOM:\nas far as I know their is only 1 bathroom in this estab

KeyboardInterrupt: 

In [None]:
# Save the trained Word2Vec model
word2vec.save("word2vec.model")

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec

# Function to load and preprocess the reviews dataset in chunks
def load_data(json_file_path, text_field, chunk_size=None, percentage_of_sentences=None):
    # Load the JSON data in chunks using pandas
    chunks = pd.read_json(json_file_path, lines=True, chunksize=chunk_size)

    # Initialize empty list to store sentences
    sentences = []

    # Iterate through chunks
    for chunk in chunks:
        # Extract the text field from the DataFrame
        chunk_text = chunk[text_field].values

        # Take only a given percentage of the current chunk if specified
        if percentage_of_sentences is not None:
            assert (percentage_of_sentences > 0 and percentage_of_sentences <= 100)
            
            len_chunk = int(percentage_of_sentences / 100 * len(chunk_text))
            chunk_text = chunk_text[:len_chunk]

        # Tokenize the sentences
        chunk_tokens = [text_to_word_sequence(_) for _ in chunk_text]
        
        # Append the tokenized sentences to the list
        sentences.extend(chunk_tokens)

    return sentences

# Specify the path to your JSON file and the field containing the review text
json_file_path = r'C:\Users\Administrator\Documents\GitHub\Datasets\yelp_academic_dataset_review.json'
text_field = 'text'
chunk_size = 5000  # Adjust the chunk size based on your available memory
# Initialize Word2Vec model outside the loop
word2vec = Word2Vec(vector_size=60, window=10, min_count=10, workers=4)
# Specify the number of epochs you want
epochs = 10
# Load and preprocess the data (20% of sentences)
for epoch in range(epochs):
    print(f"Training Word2Vec - Epoch {epoch + 1}")
    X_train = load_data(json_file_path, text_field, chunk_size=chunk_size, percentage_of_sentences=20)
    
    # Update the Word2Vec model with new sentences
    word2vec.build_vocab(X_train, update=True)
    word2vec.train(X_train, total_examples=word2vec.corpus_count, epochs=word2vec.epochs)

# Save the model
word2vec.save("word2vec_model")


Training Word2Vec - Epoch 1
