In [1]:
import pandas as pd
from collections import Counter
import string


def preprocess_text(text):
    """Tokenize and preprocess text by splitting and removing punctuation."""
    if isinstance(text, str):
        return [word.strip(string.punctuation).lower() for word in text.split() if word.isalpha()]
    return []


def analyze_keywords(input_file):
    # Read the dataset
    df = pd.read_csv(input_file)

    # Ensure the 'processed_text' column exists
    if 'processed_text' not in df.columns:
        raise ValueError(
            "The dataset must include a 'processed_text' column with raw text.")

    # Preprocess the 'processed_text' column to tokenize it into lists
    df['processed_text'] = df['processed_text'].apply(preprocess_text)

    # Separate the data by sentiment
    positive_reviews = df[df['sentiment'] == 'positive']['processed_text']
    negative_reviews = df[df['sentiment'] == 'negative']['processed_text']

    # Flatten the tokenized reviews into a single list of words
    positive_words = [word for review in positive_reviews for word in review]
    negative_words = [word for review in negative_reviews for word in review]

    # Count word frequencies
    positive_word_counts = Counter(positive_words)
    negative_word_counts = Counter(negative_words)

    # Get the top 10 most frequent words
    top_positive_keywords = positive_word_counts.most_common(10)
    top_negative_keywords = negative_word_counts.most_common(10)

    # Display the results
    print("\nTop 10 Positive Keywords:")
    print(pd.DataFrame(top_positive_keywords,
          columns=["Keyword", "Frequency"]))

    print("\nTop 10 Negative Keywords:")
    print(pd.DataFrame(top_negative_keywords,
          columns=["Keyword", "Frequency"]))


if __name__ == "__main__":
    # Replace with your file path
    input_file = "../data/processed/cleaned_sentiment_analysis_reviews.csv"
    analyze_keywords(input_file)


Top 10 Positive Keywords:
  Keyword  Frequency
0     not       2742
1      so       1987
2    very       1803
3   great       1759
4     one       1645
5     use       1609
6     can       1595
7    like       1500
8     all       1458
9    good       1397

Top 10 Negative Keywords:
  Keyword  Frequency
0     not        412
1     one        175
2      so        168
3      no        142
4     get        129
5    very        127
6    when        126
7    will        120
8   would        117
9     use        112


In [2]:
import pandas as pd
from collections import Counter
from itertools import tee

def tokenize_simple(text):
    """Tokenize text by splitting on spaces."""
    if isinstance(text, str):
        return [word.lower().strip() for word in text.split() if word.isalpha()]
    return []


def generate_bigrams(tokens):
    """Generate bigrams from a list of tokens."""
    a, b = tee(tokens)
    next(b, None)
    return list(zip(a, b))


def analyze_not_usage(input_file):
    # Read the dataset
    df = pd.read_csv(input_file)

    # Ensure the 'processed_text' column exists
    if 'processed_text' not in df.columns:
        raise ValueError(
            "The dataset must include a 'processed_text' column with raw text.")

    # Preprocess the 'processed_text' column to tokenize it into lists
    df['processed_text'] = df['processed_text'].apply(tokenize_simple)

    # Filter positive reviews containing "not"
    positive_reviews = df[df['sentiment'] == 'positive']['processed_text']
    positive_reviews_with_not = positive_reviews[positive_reviews.apply(
        lambda review: 'not' in review)]

    # Extract bigrams for reviews containing "not"
    bigram_counts = Counter()
    for review in positive_reviews_with_not:
        bigrams = generate_bigrams(review)  # Generate bigrams
        bigram_counts.update(bigrams)

    # Filter bigrams where "not" is the first word
    not_bigrams = {bigram: count for bigram,
                   count in bigram_counts.items() if bigram[0] == 'not'}

    # Sort and get the top 10 most common "not" bigrams
    sorted_not_bigrams = sorted(
        not_bigrams.items(), key=lambda x: x[1], reverse=True)[:10]

    # Convert to DataFrame for better visualization
    not_bigrams_df = pd.DataFrame(
        sorted_not_bigrams, columns=["Bigram", "Frequency"])

    # Display results
    print("\nTop 10 'Not' Bigrams in Positive Reviews:")
    print(not_bigrams_df)


# Run the analysis
# Replace with your file path
input_file = "../data/processed/cleaned_sentiment_analysis_reviews.csv"
analyze_not_usage(input_file)


Top 10 'Not' Bigrams in Positive Reviews:
        Bigram  Frequency
0  (not, sure)         74
1  (not, work)         51
2   (not, too)         43
3  (not, only)         41
4   (not, bad)         40
5   (not, use)         36
6  (not, like)         35
7  (not, much)         32
8   (not, big)         31
9  (not, good)         28


In [3]:
import pandas as pd


def extract_phrases_with_bigram(input_file, bigram_to_search):
    # Read the dataset
    df = pd.read_csv(input_file)

    # Ensure the 'processed_text' column exists
    if 'processed_text' not in df.columns:
        raise ValueError(
            "The dataset must include a 'processed_text' column with raw text.")

    # Ensure the raw review text column exists for context
    if 'reviewText' not in df.columns:
        raise ValueError(
            "The dataset must include a 'reviewText' column for full context.")

    # Tokenize the processed_text column
    df['processed_text'] = df['processed_text'].apply(
        lambda x: x.lower().split() if isinstance(x, str) else [])

    # Filter reviews that contain the bigram
    bigram_reviews = df[df['processed_text'].apply(
        lambda tokens: any(tokens[i:i+2] == list(bigram_to_search)
                           for i in range(len(tokens)-1))
    )]

    # Extract the context for each review
    contexts = []
    for _, row in bigram_reviews.iterrows():
        review_text = row['reviewText']
        if isinstance(review_text, str) and " ".join(bigram_to_search) in review_text.lower():
            contexts.append(review_text)

    # Display the sentences containing the bigram
    print(f"\nSentences containing the bigram '{' '.join(bigram_to_search)}':")
    for context in contexts[:10]:  # Limit to 10 examples for clarity
        print(f"- {context.strip()}")


if __name__ == "__main__":
    # Replace with your file path
    input_file = "../data/processed/cleaned_sentiment_analysis_reviews.csv"
    bigram_to_search = ("not", "bad")  # Specify the bigram to search
    extract_phrases_with_bigram(input_file, bigram_to_search)


Sentences containing the bigram 'not bad':
- Got this unit real fast and came in a box wrapped in plastic wrap lol. But hey was all there and not damaged. Easy to assemble and looks and feels great, I had a couple minor issues with this thing you can view on the youtube page if you like to see it. But overall not bad!  http://youtu.be/4RoQHzo-C9k?list=UUZlPl1F9XVcDu12wZ7L1pJw
- I have used many headphones before, so I have a very good idea of what headphones need to sound like.Plantronics is a great headphones:Sound quality is great the bass is very good for a small Bluetooth headphones.Fit factor: The headphones fits very nicely and snugly I do very intensive workouts with it.Battery life: is not the best but it is amazing comparing to the size of the headphones. (The charging case is a great edition to make sure your headphones are charged while in your bag)Connectivity: it is easy to connect and pair with your phone, I love the ability to pair to 8 devices.After all:It is a very go