In [3]:
import pandas as pd

# Read the prompts CSV; later cells use both `df` (the full table)
# and `prompts` (the 'Prompt' column on its own).
df = pd.read_csv('PromptsToAnalyze.csv')
prompts = df.loc[:, 'Prompt']

# Show the top rows as a quick check that the file parsed as expected.
print(df.head(5))

     Channel                            ID  \
0   newbies3    563,806,429,635,543,000.00   
1  newbies-3    775,578,578,561,531,000.00   
2  newbies-3  1,089,046,680,424,410,000.00   
3  newbies-3    456,383,955,697,008,000.00   
4  newbies-3  1,016,095,568,788,990,000.00   

                                              Prompt       Date  
0  an indian tech nerd dressed like Tom Cruise in...  3/29/2023  
1  art for podcast named Taste Buds that is hoste...  3/29/2023  
2  8k photo realistic great white shark on beach,...  3/29/2023  
3  wide angle full body shot, group of agressive ...  3/29/2023  
4                   artwork by michel granger skull-  3/29/2023  


In [None]:
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon required by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')

# A single analyzer instance is reused for every prompt.
sia = SentimentIntensityAnalyzer()

# Score each prompt. polarity_scores() only works on text, so a missing
# prompt (NaN after read_csv) is given an empty dict here and ends up with
# NaN in every score column instead of aborting the whole run.
sentiment_scores = [
    sia.polarity_scores(prompt) if isinstance(prompt, str) else {}
    for prompt in prompts
]

# Output column name -> key in the dict returned by polarity_scores().
score_columns = {
    'Positive': 'pos',
    'Negative': 'neg',
    'Neutral': 'neu',
    'Compound': 'compound',
}

# Work on a copy so the loaded DataFrame stays untouched for other cells.
# Values are assigned as plain lists, i.e. by row position, matching the
# order in which `prompts` was iterated.
output_data = df.copy()
for column_name, score_key in score_columns.items():
    output_data[column_name] = [
        scores.get(score_key, float('nan')) for scores in sentiment_scores
    ]

# Write the original columns plus the four score columns to a new CSV file.
output_data.to_csv('your_output_file.csv', index=False)


In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import csv
import pandas as pd

# Download the tokenizer model and the stopword list used below.
nltk.download('punkt')
nltk.download('stopwords')

# The stopword set does not depend on the prompt, so build it once
# instead of once per loop iteration.
stop_words = set(stopwords.words('english'))

# Lower-case and tokenize every prompt, keeping only non-stopword tokens.
# NOTE(review): tokens are not filtered for punctuation, so punctuation
# marks emitted by word_tokenize are presumably counted as "words" —
# confirm whether that is intended.
tokenized_words = []
for prompt in prompts:
    if not isinstance(prompt, str):
        # Missing prompts (NaN) cannot be tokenized; skip them.
        continue
    tokenized_words.extend(
        token
        for token in (word.lower() for word in word_tokenize(prompt))
        if token not in stop_words
    )

word_freq = Counter(tokenized_words)

# (word, count) pairs, most frequent first; ties keep first-seen order.
sorted_word_freq = word_freq.most_common()

# Define the output file path
output_file = 'word_frequencies.csv'

# Write a header row followed by one row per word. The encoding is pinned
# to UTF-8 so non-ASCII tokens do not depend on the platform default and
# the file matches the encoding pandas uses for the other CSV outputs.
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Word', 'Frequency'])
    writer.writerows(sorted_word_freq)

print(f"Word frequencies saved to {output_file}.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tdmat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tdmat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Word frequencies saved to word_frequencies.csv.


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Compute the TF-IDF matrix for the prompts. Missing prompts (NaN) are
# replaced by empty strings because the vectorizer only accepts text.
tfidf_matrix = vectorizer.fit_transform(prompts.fillna(''))

# Pairwise cosine similarity between all prompts (row i vs. row j).
cosine_similarities = cosine_similarity(tfidf_matrix)

# Two prompts count as near-duplicates when their similarity exceeds this.
threshold = 0.8

# Positions of prompts to drop: prompt j is dropped when ANY earlier prompt
# i < j is more similar than the threshold. This includes earlier prompts
# that are themselves dropped, exactly like the pairwise double loop it
# replaces, but each column is checked in one vectorised comparison and
# every position is listed only once, in ascending order.
indices_to_remove = [
    j
    for j in range(cosine_similarities.shape[0])
    if (cosine_similarities[:j, j] > threshold).any()
]

# The collected values are row positions, so select the survivors by
# position rather than by index label.
positions_removed = set(indices_to_remove)
positions_kept = [
    position for position in range(len(df)) if position not in positions_removed
]
df_filtered = df.iloc[positions_kept]

# Save the filtered prompts to a new CSV file
output_file = 'filtered_prompts.csv'
df_filtered.to_csv(output_file, index=False)

print(f"Filtered prompts saved to {output_file}.")

Filtered prompts saved to filtered_prompts.csv.
