<a href="https://colab.research.google.com/github/vaishnavi-web28/software-reviews-dashboard/blob/main/Master_Project_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1 — Install dependencies
!pip install --quiet pandas textblob scikit-learn umap-learn sentence-transformers matplotlib wordcloud


In [6]:
# Cell 2 — Upload your Software.jsonl file manually
from google.colab import files

# This will pop up an upload dialog—select your Software.jsonl from your local machine
uploaded = files.upload()

# After upload, the filename will be the key of `uploaded`
input_filename = next(iter(uploaded))
print("Uploaded file:", input_filename)


Saving software_reviews_100000.csv to software_reviews_100000.csv
Uploaded file: software_reviews_100000.csv


In [8]:
# Cell 3 — Load your Software reviews CSV with robust timestamp parsing
import pandas as pd

# This should match the name of your uploaded CSV
input_filename = 'software_reviews_100000.csv'
print("Loading:", input_filename)

# 1) Read in the CSV
df = pd.read_csv(input_filename)

# 2) Ensure we only keep the first 100k rows (if your file is larger)
if len(df) > 100_000:
    df = df.iloc[:100_000].reset_index(drop=True)

# 3) Parse the Unix timestamps into a datetime column safely
#    Some timestamps may be in seconds, others in milliseconds, or could be corrupt.
#    We’ll attempt both and coerce errors to NaT.
df['review_date'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')

# For any entries that failed as seconds, try parsing as milliseconds
mask = df['review_date'].isna()
if mask.any():
    df.loc[mask, 'review_date'] = pd.to_datetime(
        df.loc[mask, 'timestamp'], unit='ms', errors='coerce'
    )

# 4) Report parsing results
n_total = len(df)
n_bad = df['review_date'].isna().sum()
print(f"Total rows: {n_total:,} — Unparsed dates: {n_bad:,}")

# 5) (Optional) Drop any rows where review_date is still NaT
if n_bad > 0:
    df = df[df['review_date'].notna()].reset_index(drop=True)
    print(f"Dropped {n_bad:,} rows without valid dates — Remaining: {len(df):,}")

# 6) Inspect the DataFrame
df.head()


Loading: software_reviews_100000.csv
Total rows: 100,000 — Unparsed dates: 0


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,verified_purchase,helpful_vote,review_date
0,1.0,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1562182632076,False,0,2019-07-03 19:37:12.076
1,5.0,Lots of Fun,I love playing tapped out because it is fun to...,[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1424120336000,True,0,2015-02-16 20:58:56.000
2,5.0,Light Up The Dark,I love this flashlight app! It really illumin...,[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1362399267000,True,0,2013-03-04 12:14:27.000
3,4.0,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,1561061428662,True,0,2019-06-20 20:10:28.662
4,4.0,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,1418257196000,True,0,2014-12-11 00:19:56.000


In [9]:
# Cell 4 — Compute sentiment polarity with TextBlob
from textblob import TextBlob

def compute_polarity(text):
    return TextBlob(text).sentiment.polarity

df['sentiment'] = df['text'].fillna('').apply(compute_polarity)
df[['text','sentiment']].head()


Unnamed: 0,text,sentiment
0,mcaffee IS malware,0.0
1,I love playing tapped out because it is fun to...,0.433333
2,I love this flashlight app! It really illumin...,0.332292
3,One of my favorite games,0.5
4,Cute game. I am not that good at it but my kid...,0.35625


In [10]:
# Cell 5 — Save the DataFrame with sentiment to CSV
out_csv = 'software_reviews_100000_with_sentiment.csv'
df.to_csv(out_csv, index=False)
print("Saved sentiment‑augmented reviews to:", out_csv)


Saved sentiment‑augmented reviews to: software_reviews_100000_with_sentiment.csv


In [11]:
# Cell 6 — Topic Modeling (LDA)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = CountVectorizer(max_features=2_000, stop_words='english')
dtm = vectorizer.fit_transform(df['text'].fillna(''))

n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
topic_dist = lda.fit_transform(dtm)

topic_cols = [f"topic_{i+1}" for i in range(n_topics)]
df_topics = pd.DataFrame(topic_dist, columns=topic_cols)
df_topics['date'] = df['review_date'].dt.to_period('M').astype(str)

out_topics = 'software_topics_100000.csv'
df_topics.to_csv(out_topics, index=False)
print("Saved topic‑distribution CSV to:", out_topics)

# Optional: print top words per topic
words = vectorizer.get_feature_names_out()
for i, comp in enumerate(lda.components_):
    terms = [words[idx] for idx in comp.argsort()[-10:][::-1]]
    print(f"Topic {i+1} top words: {', '.join(terms)}")


Saved topic‑distribution CSV to: software_topics_100000.csv
Topic 1 top words: app, love, use, kindle, easy, great, like, work, phone, old
Topic 2 top words: game, fun, play, like, games, time, love, playing, great, good
Topic 3 top words: br, software, program, use, computer, windows, version, like, just, new
Topic 4 top words: app, great, good, free, works, watch, tv, shows, like, amazon
Topic 5 top words: product, years, ok, software, year, download, amazon, used, version, time


In [12]:
# Cell 7 — Generate embeddings & UMAP projection
from sentence_transformers import SentenceTransformer
import umap
import numpy as np

# 1) Encode all review texts
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['text'].fillna('').tolist(), show_progress_bar=True)

# 2) UMAP to 2D
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
proj_2d = reducer.fit_transform(embeddings)

# 3) Save both locally in Colab
emb_file = 'embeddings_100000.npy'
proj_file = 'embeddings_100000_2d.npy'
np.save(emb_file, embeddings)
np.save(proj_file, proj_2d)

print("Saved full embeddings to:", emb_file)
print("Saved 2D projections to:", proj_file)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

  warn(


Saved full embeddings to: embeddings_100000.npy
Saved 2D projections to: embeddings_100000_2d.npy


In [20]:
cd ~/Downloads/Code
git init
git add .
git commit -m "Initial commit: Streamlit dashboard and data artifacts"

SyntaxError: invalid syntax (ipython-input-20-4209738298.py, line 1)

In [None]:
git add YourNotebook_clean.ipynb
git commit -m "Fix: remove incomplete metadata.widgets entries"
git push