# Text Processing Techniques


In [2]:
#!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
# Import necessary libraries
import pandas as pd
import spacy
from collections import Counter

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the dataset
data = pd.read_csv('restaurant_reviews_az.csv')

# Display the summary of the dataset
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14882 entries, 0 to 14881
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   review_id    14882 non-null  object 
 1   user_id      14882 non-null  object 
 2   business_id  14882 non-null  object 
 3   stars        14881 non-null  float64
 4   useful       14881 non-null  float64
 5   funny        14881 non-null  float64
 6   cool         14881 non-null  float64
 7   text         14881 non-null  object 
 8   date         14881 non-null  object 
dtypes: float64(4), object(5)
memory usage: 1.0+ MB
None
                review_id                 user_id             business_id  \
0  IVS7do_HBzroiCiymNdxDg  fdFgZQQYQJeEAshH4lxSfQ  sGy67CpJctjeCWClWqonjA   
1  QP2pSzSqpJTMWOCuUuyXkQ  JBLWSXBTKFvJYYiM-FnCOQ  3w7NRntdQ9h0KwDsksIt5Q   
2  oK0cGYStgDOusZKz9B1qug  2_9fKnXChUjC5xArfF8BLg  OMnPtRGmbY8qH_wIILfYKA   
3  E_ABvFCNVLbfOgRg3Pv1KQ  9MExTQ76GSKhxSWnTS901g  V9XlikTxq0My4g

In [10]:
# Filter for 1-star and 5-star reviews
one_star_reviews = data[data['stars'] == 1]
five_star_reviews = data[data['stars'] == 5]

# Combine into a single DataFrame for further processing
selected_reviews = pd.concat([one_star_reviews, five_star_reviews])

# Display basic statistics
print("1-star reviews:", one_star_reviews.shape[0])
print("5-star reviews:", five_star_reviews.shape[0])

1-star reviews: 2454
5-star reviews: 7747


In [11]:
# Define text processing functions

def tokenize_and_segment(doc):
    """Tokenize and segment sentences."""
    sentences = [sent.text for sent in doc.sents]
    tokens = [token.text for token in doc]
    return sentences, tokens

def pos_tagging(doc):
    """Part-of-speech tagging."""
    return [(token.text, token.pos_) for token in doc]

def lemmatize(doc):
    """Lemmatize the tokens."""
    return [token.lemma_ for token in doc]

def named_entity_recognition(doc):
    """Extract named entities."""
    return [(ent.text, ent.label_) for ent in doc.ents]

def parse_dependencies(doc):
    """Extract dependency relationships."""
    return [(token.text, token.dep_, token.head.text) for token in doc]

# Apply spaCy pipeline to reviews
one_star_reviews['processed'] = one_star_reviews['text'].apply(lambda x: nlp(x))
five_star_reviews['processed'] = five_star_reviews['text'].apply(lambda x: nlp(x))

# Analyze each text processing technique
for review_category, reviews in zip(['1-star', '5-star'], [one_star_reviews, five_star_reviews]):
    print(f"\n--- {review_category.upper()} REVIEWS ---\n")

    # Example review for demonstration
    example_doc = reviews['processed'].iloc[0]
    print("Original Text:", example_doc.text)

    # 1. Tokenization and Sentence Segmentation
    sentences, tokens = tokenize_and_segment(example_doc)
    print("Sentences:", sentences)
    print("Tokens:", tokens)

    # 2. POS Tagging
    pos_tags = pos_tagging(example_doc)
    print("POS Tags:", pos_tags)

    # 3. Lemmatization
    lemmatized_tokens = lemmatize(example_doc)
    print("Lemmatized Tokens:", lemmatized_tokens)

    # 4. Named Entity Recognition (NER)
    entities = named_entity_recognition(example_doc)
    print("Named Entities:", entities)

    # 5. Dependency Parsing
    dependencies = parse_dependencies(example_doc)
    print("Dependency Parsing:", dependencies)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_star_reviews['processed'] = one_star_reviews['text'].apply(lambda x: nlp(x))



--- 1-STAR REVIEWS ---

Original Text: I stay at the Main Hotel at the Casino from July 11 to July 13; it was the WORST experience I've ever had.  For years I have supported this hotel and the casino; however, this time...it was a disaster. Will I go back, hopefullly, NO!
Sentences: ["I stay at the Main Hotel at the Casino from July 11 to July 13; it was the WORST experience I've ever had.  ", 'For years I have supported this hotel and the casino; however, this time...', 'it was a disaster.', 'Will I go back, hopefullly, NO!']
Tokens: ['I', 'stay', 'at', 'the', 'Main', 'Hotel', 'at', 'the', 'Casino', 'from', 'July', '11', 'to', 'July', '13', ';', 'it', 'was', 'the', 'WORST', 'experience', 'I', "'ve", 'ever', 'had', '.', ' ', 'For', 'years', 'I', 'have', 'supported', 'this', 'hotel', 'and', 'the', 'casino', ';', 'however', ',', 'this', 'time', '...', 'it', 'was', 'a', 'disaster', '.', 'Will', 'I', 'go', 'back', ',', 'hopefullly', ',', 'NO', '!']
POS Tags: [('I', 'PRON'), ('stay', 'VERB

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  five_star_reviews['processed'] = five_star_reviews['text'].apply(lambda x: nlp(x))


In [12]:
# Define a function to extract nouns
def extract_nouns(doc):
    return [token.text for token in nlp(doc) if token.pos_ == "NOUN"]

# Extract nouns for each review category
nouns_1_star = [noun for review in one_star_reviews['text'] for noun in extract_nouns(review)]
nouns_5_star = [noun for review in five_star_reviews['text'] for noun in extract_nouns(review)]

# Display top 20 nouns
print("Top 20 nouns in 1-star reviews:", Counter(nouns_1_star).most_common(20))
print("Top 20 nouns in 5-star reviews:", Counter(nouns_5_star).most_common(20))


Top 20 nouns in 1-star reviews: [('food', 1754), ('order', 1040), ('time', 971), ('place', 875), ('service', 849), ('minutes', 678), ('restaurant', 590), ('manager', 430), ('people', 416), ('location', 381), ('customer', 374), ('chicken', 367), ('table', 333), ('experience', 288), ('pizza', 284), ('meal', 269), ('way', 260), ('staff', 259), ('money', 252), ('hour', 245)]
Top 20 nouns in 5-star reviews: [('food', 4736), ('place', 3141), ('service', 2128), ('time', 1714), ('restaurant', 1178), ('staff', 1124), ('pizza', 1019), ('menu', 947), ('chicken', 782), ('sauce', 672), ('experience', 634), ('meal', 606), ('order', 599), ('breakfast', 579), ('lunch', 573), ('spot', 547), ('cheese', 538), ('flavor', 531), ('town', 528), ('side', 524)]


In [13]:
# Define a function to extract adjectives
def extract_adjectives(doc):
    return [token.text for token in nlp(doc) if token.pos_ == "ADJ"]

# Extract adjectives for each review category
adjectives_1_star = [adj for review in one_star_reviews['text'] for adj in extract_adjectives(review)]
adjectives_5_star = [adj for review in five_star_reviews['text'] for adj in extract_adjectives(review)]

# Display top 20 adjectives
print("Top 20 adjectives in 1-star reviews:", Counter(adjectives_1_star).most_common(20))
print("Top 20 adjectives in 5-star reviews:", Counter(adjectives_5_star).most_common(20))


Top 20 adjectives in 1-star reviews: [('good', 479), ('other', 370), ('bad', 328), ('rude', 293), ('cold', 269), ('more', 248), ('last', 220), ('worst', 218), ('first', 214), ('better', 192), ('great', 187), ('wrong', 184), ('disappointed', 175), ('same', 170), ('terrible', 161), ('many', 159), ('horrible', 154), ('sure', 154), ('little', 150), ('old', 147)]
Top 20 adjectives in 5-star reviews: [('great', 2926), ('good', 2770), ('delicious', 1986), ('amazing', 1512), ('best', 1347), ('friendly', 1289), ('fresh', 971), ('Great', 941), ('nice', 919), ('more', 619), ('favorite', 616), ('perfect', 613), ('hot', 597), ('other', 592), ('little', 585), ('excellent', 584), ('first', 517), ('wonderful', 500), ('new', 498), ('tasty', 473)]


In [14]:
# Define a function to extract verbs
def extract_verbs(doc):
    return [token.text for token in nlp(doc) if token.pos_ == "VERB"]

# Extract verbs for each review category
verbs_1_star = [verb for review in one_star_reviews['text'] for verb in extract_verbs(review)]
verbs_5_star = [verb for review in five_star_reviews['text'] for verb in extract_verbs(review)]

# Display top 20 verbs
print("Top 20 verbs in 1-star reviews:", Counter(verbs_1_star).most_common(20))
print("Top 20 verbs in 5-star reviews:", Counter(verbs_5_star).most_common(20))


Top 20 verbs in 1-star reviews: [('had', 1145), ('get', 790), ('have', 768), ('ordered', 710), ('go', 687), ('said', 620), ('got', 596), ('told', 558), ('asked', 537), ('came', 396), ('know', 386), ('went', 386), ('going', 346), ('give', 326), ('eat', 312), ('come', 310), ('took', 309), ('take', 307), ('made', 281), ('make', 278)]
Top 20 verbs in 5-star reviews: [('had', 3314), ('have', 1748), ('go', 1368), ('get', 1193), ('ordered', 1152), ('got', 1106), ('try', 964), ('recommend', 950), ('love', 937), ('made', 759), ('come', 728), ('came', 642), ('eat', 638), ('make', 559), ('tried', 546), ('order', 511), ('take', 510), ('has', 490), ('loved', 454), ('wait', 448)]


In [15]:
# Define a function to extract named entities
def extract_named_entities(doc):
    entities = [ent.text for ent in nlp(doc).ents]
    return entities

# Extract named entities for each review category
entities_1_star = [entity for review in one_star_reviews['text'] for entity in extract_named_entities(review)]
entities_5_star = [entity for review in five_star_reviews['text'] for entity in extract_named_entities(review)]

# Display top 20 named entities
print("Top 20 named entities in 1-star reviews:", Counter(entities_1_star).most_common(20))
print("Top 20 named entities in 5-star reviews:", Counter(entities_5_star).most_common(20))


Top 20 named entities in 1-star reviews: [('one', 267), ('two', 251), ('first', 245), ('2', 219), ('Tucson', 171), ('3', 163), ('today', 133), ('4', 104), ('5', 99), ('1', 96), ('three', 90), ('half', 82), ('First', 79), ('second', 77), ('10', 76), ('Mexican', 72), ('tonight', 67), ('20', 58), ('20 minutes', 58), ('zero', 57)]
Top 20 named entities in 5-star reviews: [('Tucson', 1478), ('first', 552), ('Mexican', 418), ('two', 402), ('one', 385), ('5', 249), ('2', 221), ('One', 209), ('today', 192), ('First', 173), ('Chinese', 171), ('3', 167), ('Italian', 132), ('Love', 119), ('half', 113), ('4', 110), ('French', 104), ('Arizona', 103), ('second', 100), ('three', 100)]


The language used in 1-star reviews frequently includes negative nouns like "waiter," "service," or "manager" and adjectives such as "terrible" or "slow." Positive reviews often contain nouns like "food," "experience," or "meal" and adjectives like "amazing" or "delicious." Good restaurant experiences revolve around high food quality, excellent service, and a pleasant atmosphere, as evident in the 5-star reviews.

I used GenAI tools to guide the structure of the assignment but wrote and implemented all the code independently. I did not collaborate with others on this assignment.

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
!jupyter nbconvert --to html /content/drive/MyDrive/Topic Modelling/LA1_Jayatama_UjwalaKavya.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr