In [None]:
import re
import string

import pandas as pd

In [None]:
# Load the Steam reviews export from disk
df = pd.read_csv("steam_reviews.csv")

# Keep only the columns this notebook works with
kept_columns = ['recommendation', 'review', 'title']
df = df[kept_columns]

# Binary label: 0 for "Not Recommended", 1 for every other recommendation value
df['class'] = df['recommendation'].ne("Not Recommended").astype("int64")
df.info()

In [None]:
# Discard every row that has a missing value in any of its columns
df = df.dropna(axis=0, how='any')

In [None]:
# Keep reviews strictly longer than 50 and strictly shorter than 120 characters
review_lengths = df['review'].apply(len)
df = df[review_lengths.gt(50) & review_lengths.lt(120)]

In [None]:
# Minimum number of (already length-filtered) reviews a title must have to be kept
MIN_REVIEWS_PER_TITLE = 29

# Calculate the number of reviews per 'title'
title_review_counts = df.groupby('title')['review'].count()

# Find 'title' groups with fewer than MIN_REVIEWS_PER_TITLE reviews
titles_to_drop = title_review_counts[title_review_counts < MIN_REVIEWS_PER_TITLE].index

# Remove these 'title' groups from the DataFrame.
# .copy() makes `data` an independent frame, so assigning columns on it later
# does not write into a slice of `df` (which raises SettingWithCopyWarning).
data = df[~df['title'].isin(titles_to_drop)].copy()

print(data)


In [None]:
# Regex matching a single punctuation mark at the end of the text.
# string.punctuation contains regex metacharacters (\ ] ^ -), so it has to be
# escaped before going inside a character class: unescaped, "\]" is parsed as
# an escaped bracket and a trailing backslash is never treated as punctuation.
punctuation = string.punctuation
regex = f"[{re.escape(punctuation)}]$"

# Add a period (.) to reviews that do not end with a punctuation mark.
# A single vectorized match replaces building a one-element Series per row.
# na=True leaves missing reviews untouched instead of appending to them.
ends_with_punctuation = data['review'].str.contains(regex, na=True)

# assign() returns a new frame, so nothing is written into a slice of `df`.
data = data.assign(
    review=data['review'].where(ends_with_punctuation, data['review'] + ".")
)

In [None]:
# Reviews grouped per (title, recommendation) pair; shared by both aggregations below
group_keys = ['title', 'recommendation']
reviews_by_group = data.groupby(group_keys)['review']

# Number of reviews in each (title, recommendation) group
review_count_df = reviews_by_group.count().reset_index()

# All review texts of each group concatenated (string sum) into one string
review_texts_df = reviews_by_group.sum().reset_index()

# Attach the counts to the concatenated texts. Both frames carry a 'review'
# column, so the suffixes rename them to 'review_text' and 'review_count'.
merged_df = review_texts_df.merge(
    review_count_df,
    how='left',
    on=group_keys,
    suffixes=('_text', '_count'),
)


In [None]:
# Build one formatted line per (title, recommendation) group, reading the four
# columns in parallel instead of materialising a Series for every row
result = [
    f"For '{title}' '{review_count}' {recommendation} reviews: '{review_text}'"
    for title, recommendation, review_count, review_text in zip(
        merged_df['title'],
        merged_df['recommendation'],
        merged_df['review_count'],
        merged_df['review_text'],
    )
]

# Combine the lines into a single newline-separated document
document = "\n".join(result)

In [None]:
# Persist the assembled document as UTF-8 text
with open('reviews_document.txt', mode='w', encoding='utf-8') as output_file:
    output_file.write(document)

print("Text created!")