# Prompt Data Analysis Tools
Thomas Mattson

Westmont College - CS-195 Senior Seminar

April 29, 2023

## Read Prompt Data

In [3]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import PorterStemmer

# Load the CSV file into a pandas DataFrame.
# The cells below read the prompt text from its 'Prompt' column.
df = pd.read_csv('PromptsToAnalyze.csv')

# Create an instance of the PorterStemmer; stem_tokens() reuses this one object for every row
stemmer = PorterStemmer()

# Function to tokenize the text
def tokenize_text(text):
    """Return the list of tokens that NLTK's word_tokenize produces for `text`."""
    tokens = word_tokenize(text)
    return tokens

# POS tags treated as adjectives by the filter below
_ADJECTIVE_TAGS = frozenset({'JJ', 'JJR', 'JJS'})

# English stopword set; filled in on first use and then shared by every call
_english_stop_words = None


# Function to remove adjectives and stopwords
def remove_adjectives_stopwords(tokenized_text):
    """Drop adjectives and English stopwords from a token list.

    Args:
        tokenized_text: list of string tokens, as returned by tokenize_text.

    Returns:
        A new list with the surviving tokens, lowercased, in their original
        order. Tokens are POS-tagged in their original casing; a token is
        dropped when its tag is JJ, JJR or JJS, or when its lowercase form
        is in NLTK's English stopword list.
    """
    global _english_stop_words
    if _english_stop_words is None:
        # This function is applied once per DataFrame row, so load the
        # stopword corpus a single time instead of on every call.
        _english_stop_words = frozenset(stopwords.words('english'))

    return [
        word.lower()
        for word, tag in pos_tag(tokenized_text)
        if tag not in _ADJECTIVE_TAGS and word.lower() not in _english_stop_words
    ]

# Function to stem the words
def stem_tokens(tokenized_text):
    """Return a new list with the stem of each token, in the same order.

    Stemming is delegated to the module-level `stemmer` object.
    """
    stemmed = []
    for token in tokenized_text:
        stemmed.append(stemmer.stem(token))
    return stemmed

# Apply tokenization and adjective removal to the 'Prompt' column
df['Tokenized Text'] = df['Prompt'].apply(tokenize_text)
df['Tokenized Text'] = df['Tokenized Text'].apply(remove_adjectives_stopwords)

# Create a new column 'Adjectives Removed'.
# 'Tokenized Text' has already been through remove_adjectives_stopwords, so copy
# its lists instead of filtering a second time: re-tagging the stripped,
# lowercased tokens hands the POS tagger different input and can drop words
# that are not adjectives.
# NOTE(review): the two columns now hold the same tokens - confirm whether
# 'Tokenized Text' was meant to keep the unfiltered tokens instead.
df['Adjectives Removed'] = df['Tokenized Text'].apply(list)

# Create a new column 'Stemmed Tokens' by stemming the words in 'Tokenized Text'
df['Stemmed Tokens'] = df['Tokenized Text'].apply(stem_tokens)

# Print the first few rows of the DataFrame to check that it loaded correctly
print(df.head())


     Channel                            ID  \
0  newbies 3    563,806,429,635,543,000.00   
1  newbies 3    775,578,578,561,531,000.00   
2  newbies 3  1,089,046,680,424,410,000.00   
3  newbies 3    456,383,955,697,008,000.00   
4  newbies 3  1,016,095,568,788,990,000.00   

                                              Prompt       Date  \
0  an indian tech nerd dressed like Tom Cruise in...  3/29/2023   
1  art for podcast named Taste Buds that is hoste...  3/29/2023   
2  8k photo realistic great white shark on beach ...  3/29/2023   
3  wide angle full body shot  group of agressive ...  3/29/2023   
4                   artwork by michel granger skull   3/29/2023   

                                      Tokenized Text  \
0  [tech, nerd, dressed, like, tom, cruise, actio...   
1  [art, podcast, named, taste, buds, hosted, two...   
2  [8k, photo, shark, beach, teeth, eyes, colors,...   
3  [angle, body, group, cyberpunk, 2077, npcs, 6t...   
4                  [artwork, michel, gra

In [4]:
# Write df (prompts plus the token columns added above) to tokens.csv, without the row index
df.to_csv('tokens.csv', index=False)

## Sentiment Analysis using NLTK

### Sentiment Calculation

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Download necessary resources
nltk.download('vader_lexicon')

# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# `prompts` is never assigned in the visible notebook cells, so take the
# prompt text straight from the DataFrame loaded at the top of the notebook
prompts = df['Prompt'].tolist()

# Perform sentiment analysis for each prompt; each result is a dict with the
# keys 'neg', 'neu', 'pos' and 'compound'
sentiment_scores = [sia.polarity_scores(prompt) for prompt in prompts]

# Create a new DataFrame with the sentiment scores and prompts.
# Rename by key rather than by position so a column can never be mislabeled.
sentimentAnalysis_df = pd.DataFrame(sentiment_scores).rename(
    columns={
        'neg': 'Negative',
        'neu': 'Neutral',
        'pos': 'Positive',
        'compound': 'Compound',
    }
)
sentimentAnalysis_df['Prompt'] = prompts  # Add the 'Prompt' column
sentimentAnalysis_df = sentimentAnalysis_df[['Prompt', 'Negative', 'Neutral', 'Positive', 'Compound']]  # Reorder columns

print(sentimentAnalysis_df.head())


### Sentiment Visualization

In [None]:
import matplotlib.pyplot as plt

# Pie chart of the mean Positive / Negative / Neutral score across all prompts
labels = ['Positive', 'Negative', 'Neutral']
colors = ['#66b3ff', '#ff9999', '#99ff99']
sizes = sentimentAnalysis_df[['Positive', 'Negative', 'Neutral']].mean()

plt.figure(figsize=(6, 6))
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
plt.axis('equal')
plt.title('Sentiment Analysis')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Distribution of the 'Compound' column of sentimentAnalysis_df
compound_scores = sentimentAnalysis_df['Compound']

# Ten-bin histogram with outlined bars
plt.hist(compound_scores, bins=10, edgecolor='black')

# Title and axis labels
plt.title('Histogram of Compound Scores')
plt.xlabel('Compound Score')
plt.ylabel('Frequency')

# Render the figure
plt.show()


### Sentiment Output

In [None]:
# Saves the 'sentimentAnalysis_df' DataFrame (prompt plus its four sentiment columns) to disk

# Define the output file path
sentiment_output_file = 'sentiment_analysis.csv'

# Write the output DataFrame to a new CSV file, without the row index
sentimentAnalysis_df.to_csv(sentiment_output_file, index=False)
print(f"Sentiment analysis saved to {sentiment_output_file}.")

## Word Frequency

### Frequency Processing

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')

# Build the stopword set once; it is the same for every row
stop_words = set(stopwords.words('english'))

# Lowercase each row's tokens, drop stopwords, and pool everything into one list
tokenized_words = []
for row_tokens in df['Tokenized Text']:
    # Each element of the column is that row's own token list; the loop must
    # read it (the loop variable) rather than some other, undefined name
    lowered = [word.lower() for word in row_tokens]
    tokenized_words.extend(word for word in lowered if word not in stop_words)

word_freq = Counter(tokenized_words)

# (word, count) pairs in descending order of count
sorted_word_freq = word_freq.most_common()

# Create a DataFrame for word frequencies
wordFrequency_df = pd.DataFrame(sorted_word_freq, columns=['Word', 'Frequency'])

wordFrequency_df.head()

### Frequency Without Adjectives

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from collections import Counter

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Loop invariants: build them once instead of once per prompt
stop_words = set(stopwords.words('english'))
adjective_tags = {'JJ', 'JJR', 'JJS'}

# Tokenize the prompts, remove adjectives and stopwords, and pool the remaining words.
# The prompts are read from df['Prompt'] because no `prompts` variable is
# assigned anywhere in the visible notebook cells.
tokenized_words = []
for prompt in df['Prompt']:
    tokenized_text = [word.lower() for word in word_tokenize(prompt)]
    pos_tagged_text = pos_tag(tokenized_text)
    tokenized_words.extend(
        word
        for word, tag in pos_tagged_text
        if tag not in adjective_tags and word not in stop_words
    )

word_freq = Counter(tokenized_words)

# (word, count) pairs in descending order of count
sorted_word_freq = word_freq.most_common()

# Create a DataFrame for word frequencies
wordFrequency_noAdjectives_df = pd.DataFrame(sorted_word_freq, columns=['Word', 'Frequency'])

wordFrequency_noAdjectives_df.head()

### Frequency Analysis

In [None]:
# First ten rows of the frequency table (already sorted most-frequent first)
top_10_words = wordFrequency_df.head(10)

# Bar chart: one bar per word, bar height = its frequency
plt.figure(figsize=(10, 6))
plt.bar(x=top_10_words['Word'], height=top_10_words['Frequency'])
plt.title('Top 10 Most Frequent Words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# First ten rows of the adjective-free frequency table
top_10_nonAdjectives = wordFrequency_noAdjectives_df.head(10)

# Create a bar plot of the word frequencies.
# Plot the adjective-free selection made above, so the chart matches its title.
plt.figure(figsize=(10, 6))
plt.bar(top_10_nonAdjectives['Word'], top_10_nonAdjectives['Frequency'])
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10 Most Frequent Words (No Adjectives)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# First twenty-five rows of the frequency table
top_25_words = wordFrequency_df.head(25)

# Scatter plot with frequency on the x-axis and the word on the y-axis
plt.figure(figsize=(10, 6))
plt.scatter(
    x=top_25_words['Frequency'],
    y=top_25_words['Word'],
    s=100,
    c='b',
    alpha=0.7,
)
plt.title('Word Frequencies')
plt.xlabel('Frequency')
plt.ylabel('Words')
plt.grid(True)
plt.tight_layout()
plt.show()

### Frequency Output to File

In [None]:
# Define the output file path
output_file = 'word_frequencies1.csv'

# Save the word-frequency table to CSV, without the row index.
# This must be wordFrequency_df (the Word/Frequency table); `df` is the
# prompt DataFrame and contains no frequency data.
wordFrequency_df.to_csv(output_file, index=False)

print(f"Word frequencies saved to {output_file}.")

## Natural Language Processing

In [None]:
import nltk
import gensim
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import csv
from nltk import pos_tag
from nltk.corpus import wordnet
import re
# Fetch the tagger resource used by the pos_tag calls in this cell
nltk.download('averaged_perceptron_tagger')

# Create a PorterStemmer instance for this cell
stemmer = PorterStemmer()

def replace_words(text, replacements):
    """Replace whole-word occurrences of the keys of `replacements` in `text`.

    Matching is case-insensitive and restricted to word boundaries, so with
    the key 'boy' the text 'Boy' is replaced but 'boyhood' is left alone.

    Args:
        text: the string to rewrite.
        replacements: mapping from the word to look for to the string that
            replaces it. Keys are matched literally and without regard to
            case.

    Returns:
        The rewritten string; `text` unchanged when `replacements` is empty.
    """
    if not replacements:
        # An empty alternation would match the empty string at every word
        # boundary and then fail the lookup below.
        return text

    # The match is lowercased before lookup, so the keys must be lowercased too
    lookup = {key.lower(): value for key, value in replacements.items()}

    # Escape keys so regex metacharacters in them are matched literally, and
    # try longer keys first so a short key cannot shadow a longer one that
    # starts with it.
    alternatives = sorted(lookup, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:%s)\b' % '|'.join(re.escape(key) for key in alternatives),
        re.IGNORECASE,
    )
    return pattern.sub(lambda match: lookup[match.group(0).lower()], text)

# Words to substitute: 'girl'/'female' become 'woman', 'boy'/'male' become 'man'
replacements = dict(
    girl='woman',
    female='woman',
    boy='man',
    male='man',
)

# Copy df and rewrite the copy's 'Prompt' column with the substitutions applied
adjectiveFree_df = df.copy()
adjectiveFree_df['Prompt'] = adjectiveFree_df['Prompt'].apply(
    lambda prompt: replace_words(prompt, replacements)
)

# Tokenize the rewritten prompts into a new 'Tokenized_Text' column
adjectiveFree_df['Tokenized_Text'] = adjectiveFree_df['Prompt'].apply(word_tokenize)

# Disabled: the list below sits inside a string literal, so toRemoveWords is never assigned
'''toRemoveWords = ['red', 'orange', 'yellow', 'green', 
                 'blue', 'purple', 'black', 'white', 
                 'grey', 'gray',
                 'style', 'background', 'dark', 'lighting']
'''
              
# Words to drop: English stopwords plus every single punctuation character.
# Color/style words are NOT removed - the variant that added toRemoveWords is commented out:
# stop_words = set(stopwords.words('english') + list(string.punctuation) + toRemoveWords)
stop_words = set(stopwords.words('english') + list(string.punctuation))

# Removing Adjectives
def remove_adjectives(pos_tagged_text):
    """Filter a list of (word, tag) pairs down to lowercased words.

    A pair is skipped when its tag is JJ, JJR or JJS, or when the lowercased
    word is in the module-level `stop_words` set.
    """
    return [
        word.lower()
        for word, tag in pos_tagged_text
        if not (tag in ['JJ', 'JJR', 'JJS'] or word.lower() in stop_words)
    ]

# POS-tag each row's token list, then keep only the lowercased words that
# remove_adjectives lets through (non-adjectives that are not in stop_words)
adjectiveFree_df['POS_Tagged_Text'] = adjectiveFree_df['Tokenized_Text'].apply(pos_tag)
adjectiveFree_df['Filtered_Text'] = adjectiveFree_df['POS_Tagged_Text'].apply(remove_adjectives)

# Show the first rows to inspect the new columns
print(adjectiveFree_df.head())