In [4]:
pip install pandas app-store-scraper scikit-learn nltk flask

Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install --upgrade urllib3 requests six

Collecting requests
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Installing collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
Successfully installed requests-2.32.5


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
app-store-scraper 0.3.5 requires requests==2.23.0, but you have requests 2.32.5 which is incompatible.


In [29]:
# Step 1: Install the alternative scraper library
!pip install app-store-web-scraper

# Step 2: Import the necessary libraries
from app_store_web_scraper import AppStoreEntry
import pandas as pd

# Step 3: Define the app details
app_id = '768469908'
country = 'us'

# Step 4: Scrape the reviews using the new library
app = AppStoreEntry(app_id=app_id, country=country)

# First, create an empty list to store the reviews
reviews_list = []

# Then, loop through the reviews and add them to the list
# Note: The underlying Apple API this library uses often limits results to a maximum of 500 reviews per app.
for review in app.reviews(limit=2000):
    reviews_list.append({
        'date': review.date,
        'rating': review.rating,
        'userName': review.user_name,
        'title': review.title,
        'review': review.review
    })

# Step 5: Convert the list of reviews into a pandas DataFrame
reviews_df = pd.DataFrame(reviews_list)

# Step 6: Save the data to a CSV file and display the results
if not reviews_df.empty:
    reviews_df.to_csv('youcam_reviews.csv', index=False)
    print(f"Successfully scraped {len(reviews_df)} reviews and saved to youcam_reviews.csv")
    print("Here are the first 5 rows of your data:")
    print(reviews_df.head())
else:
    print("Failed to scrape any reviews. The app may not have reviews in this region, or the API is temporarily unavailable.")




  'review': review.review


Successfully scraped 500 reviews and saved to youcam_reviews.csv
Here are the first 5 rows of your data:
                       date  rating         userName  \
0 2025-09-20 08:10:15-07:00       1           jzy612   
1 2025-09-19 18:36:43-07:00       5  Crea tus videos   
2 2025-09-18 17:40:54-07:00       3           𝕓𝕣𝕒𝕟𝕕𝕚   
3 2025-09-08 08:04:35-07:00       1   Mattysclerosis   
4 2025-09-08 07:04:12-07:00       1        BREasy518   

                                title  \
0                         Bad service   
1                            La mejor   
2                        Very glitchy   
3                                Scam   
4  Took payment before trial was over   

                                              review  
0  At first it was all good and when i try to sub...  
1       Me encanta recomendada para editar tus fotos  
2                Slow and glitchy. Frustrated to use  
3  Said of you paid for the pro version you get 1...  
4  I downloaded and looked at the ap

In [33]:
# Step 1: Load the CSV file into a pandas DataFrame
import pandas as pd
df = pd.read_csv('youcam_reviews.csv')

# Step 2: Select only the columns we need for this project ('review' and 'rating')
df = df[['review', 'rating']]

# Step 3: Remove 3-star reviews, as they are often neutral and can confuse the model
df = df[df['rating']!= 3]

# Step 4: Create a new 'sentiment' column based on the rating.
# If the rating is greater than 3 (i.e., 4 or 5), we label it as 1 (Positive).
# Otherwise (i.e., 1 or 2), we label it as 0 (Negative).
df['sentiment'] = df['rating'].apply(lambda rating: 1 if rating > 3 else 0)

# Step 5: Drop the original 'rating' column as we no longer need it
df = df.drop('rating', axis=1)

# Step 6: Check the distribution of positive vs. negative reviews and display the first 5 rows
print("Dataset shape:", df.shape)
print("\nSentiment distribution:")
print(df['sentiment'].value_counts())
print("\nHere are the first 5 rows of the prepared data:")
df.head()

Dataset shape: (471, 2)

Sentiment distribution:
sentiment
0    264
1    207
Name: count, dtype: int64

Here are the first 5 rows of the prepared data:


Unnamed: 0,review,sentiment
0,At first it was all good and when i try to sub...,0
1,Me encanta recomendada para editar tus fotos,1
3,Said of you paid for the pro version you get 1...,0
4,I downloaded and looked at the app said a 7dat...,0
5,"This was a fantastic app, so much so AI purcha...",0


In [38]:
# Step 1: Download necessary NLTK data files
# This is a one-time download. NLTK (Natural Language Toolkit) needs these files to understand stopwords,
# how to tokenize sentences, and how to find the root of a word (lemmatization).
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Step 2: Create a function to clean the text
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Initialize the lemmatizer and stop words list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Ensure the input is a string
    if not isinstance(text, str):
        return ""
    
    # 1. Convert to lowercase
    text = text.lower()
    
    # 2. Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 3. Tokenize the text into words
    tokens = word_tokenize(text)
    
    # 4. Remove stop words and lemmatize
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # 5. Join the words back into a single string
    return " ".join(processed_tokens)

# Step 3: Apply the preprocessing function to the 'review' column
# This creates a new column 'processed_review' with the cleaned text.
df['processed_review'] = df['review'].apply(preprocess_text)

# Step 4: Display the original review and the new processed review for comparison
print("Text preprocessing complete.")
print("Here's a comparison of the original vs. processed reviews:")
print(df[['review', 'processed_review']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tonyt\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tonyt\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tonyt\AppData\Roaming\nltk_data...


Text preprocessing complete.
Here's a comparison of the original vs. processed reviews:
                                              review  \
0  At first it was all good and when i try to sub...   
1       Me encanta recomendada para editar tus fotos   
3  Said of you paid for the pro version you get 1...   
4  I downloaded and looked at the app said a 7dat...   
5  This was a fantastic app, so much so AI purcha...   

                                    processed_review  
0  first good try subscribe pick cheapest one did...  
1          encanta recomendada para editar tus fotos  
3  said paid pro version get free image video per...  
4  downloaded looked app said dat trial use immed...  
5  fantastic app much ai purchased vip plan year ...  


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Define your features (X) and target (y)
# X is the processed text you want to use for prediction.
# y is the sentiment label (0 or 1) you want to predict.
X = df['processed_review']
y = df['sentiment']

# Step 2: Split the data into training and testing sets
# We'll use 80% of the data for training and 20% for testing.
# 'stratify=y' ensures that the proportion of positive and negative reviews is the same in both sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Initialize and fit the TF-IDF Vectorizer
# This will learn the vocabulary from your training data and convert the text into numerical vectors.
# 'max_features=5000' limits the vocabulary to the 5000 most frequent words.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Only transform the test data using the learned vocabulary
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 4: Train the Logistic Regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_tfidf, y_train)

# Step 5: Make predictions on the test set
y_pred = lr_model.predict(X_test_tfidf)

# Step 6: Evaluate the model's performance
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

Accuracy: 0.9158

Classification Report:
              precision    recall  f1-score   support

    Negative       0.89      0.96      0.93        53
    Positive       0.95      0.86      0.90        42

    accuracy                           0.92        95
   macro avg       0.92      0.91      0.91        95
weighted avg       0.92      0.92      0.92        95



In [42]:
import joblib

# Step 1: Save the trained Logistic Regression model
joblib.dump(lr_model, 'sentiment_model.pkl')

# Step 2: Save the TF-IDF Vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer have been saved to files:")
print(" - sentiment_model.pkl")
print(" - tfidf_vectorizer.pkl")

Model and vectorizer have been saved to files:
 - sentiment_model.pkl
 - tfidf_vectorizer.pkl


In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Step 1: Filter for only the negative reviews from your dataframe
negative_reviews = df[df['sentiment'] == 0]['processed_review']

# Step 2: Create a new vectorizer for topic modeling
# LDA works better with simple word counts, so we use CountVectorizer instead of TF-IDF.
count_vectorizer = CountVectorizer(max_df=0.9, min_df=5, stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(negative_reviews)

# Step 3: Train the LDA model to find 5 distinct topics
# n_components is the number of topics you want to find. 5 is a good starting point.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(doc_term_matrix)

# Step 4: Display the top words for each discovered topic
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx+1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

print("Top words for each identified user pain point (topic):")
display_topics(lda, count_vectorizer.get_feature_names_out(), 10)

Top words for each identified user pain point (topic):
Topic 1:
app money thing ad tool pay lot away reason use
Topic 2:
free trial app charged refund year apple subscription day cancel
Topic 3:
app paying subscription pay photo make image premium need service
Topic 4:
app money time pay dont work use like waste month
Topic 5:
app feature photo use used pay im time free editing


In [54]:
# This function gets the top words for each topic
def get_topics(model, feature_names, no_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        topic_name = f"Pain Point {topic_idx+1}"
        topic_words = " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        # For the chart, we'll use the model's internal topic strength as a placeholder value
        topic_strength = int(topic.sum()) 
        topics.append([topic_name, topic_strength, topic_words])
    return topics

# Get the topics and their top words
topics = get_topics(lda, count_vectorizer.get_feature_names_out(), 5)

# Convert to a pandas DataFrame with the correct column names
topics_df = pd.DataFrame(topics, columns=["Topic", "Strength", "Top Words"])

# Save the DataFrame to a CSV file
topics_df.to_csv('topics.csv', index=False)

print("Successfully saved topic data to topics.csv")
print(topics_df)


Successfully saved topic data to topics.csv
          Topic  Strength                          Top Words
0  Pain Point 1       303            app money thing ad tool
1  Pain Point 2       918      free trial app charged refund
2  Pain Point 3       405  app paying subscription pay photo
3  Pain Point 4       727            app money time pay dont
4  Pain Point 5      1050         app feature photo use used
