### 과정

In [5]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Download the NLTK resource needed for tokenization.
nltk.download('punkt')

# Load the tab-separated review file from the mounted Drive folder.
df = pd.read_csv(
    '/content/drive/MyDrive/IMbank_텍스트마이닝/amazon_reviews_all.tsv',
    sep='\t',
)

# Normalize case first, then split each lower-cased review into word tokens.
df['Review'] = df['Review'].str.lower()
df['word_tokens'] = df['Review'].map(word_tokenize)
df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Review,Star,word_tokens
0,i use it and it shows your skin glow but your ...,1,"[i, use, it, and, it, shows, your, skin, glow,..."
1,if you are someone who regularly purges when t...,1,"[if, you, are, someone, who, regularly, purges..."
2,i tried so hard to like thus spray but unfortu...,1,"[i, tried, so, hard, to, like, thus, spray, bu..."
3,this serum has artificial fragrance and theref...,1,"[this, serum, has, artificial, fragrance, and,..."
4,first off i want to say i don’t have sensitive...,1,"[first, off, i, want, to, say, i, don, ’, t, h..."
...,...,...,...
495,this was recommended by “ painted by spencer” ...,5,"[this, was, recommended, by, “, painted, by, s..."
496,deja un rico aroma y hidrata,5,"[deja, un, rico, aroma, y, hidrata]"
497,it’s work very well i recommend it 🥰🥰🥰🥰 great ...,5,"[it, ’, s, work, very, well, i, recommend, it,..."
498,this really melts your makeup together as sett...,5,"[this, really, melts, your, makeup, together, ..."


In [7]:
from collections import Counter

def clean_by_freq(tokenized_words, cut_off_count):
    """Drop every token whose frequency is at most ``cut_off_count``.

    Args:
        tokenized_words: Sequence of tokens; frequencies are counted over
            this sequence only.
        cut_off_count: Frequency threshold. Tokens occurring this many times
            or fewer are removed; tokens occurring strictly more often stay.

    Returns:
        A new list holding the surviving tokens in their original order,
        duplicates included.
    """
    occurrences = Counter(tokenized_words)

    kept = []
    for token in tokenized_words:
        # Keep only tokens that are strictly more frequent than the cut-off.
        if occurrences[token] > cut_off_count:
            kept.append(token)

    return kept


def clean_by_len(tokenized_words, cut_off_length):
    """Drop every token whose length is at most ``cut_off_length``.

    Args:
        tokenized_words: Iterable of tokens (anything supporting ``len``).
        cut_off_length: Length threshold. Tokens of this length or shorter
            are removed.

    Returns:
        A new list with the tokens strictly longer than ``cut_off_length``,
        in their original order.
    """
    return [token for token in tokenized_words if len(token) > cut_off_length]


# # 문제의 조건에 맞게 함수를 호출해 주세요
# cleaned_by_freq = clean_by_freq(tokenized_words, 2)
# cleaned_words = clean_by_len(cleaned_by_freq, 2)


In [9]:
# Stop-word removal helper
def clean_by_stopwords(tokenized_words, stop_words_set):
    """Remove every token that is a member of ``stop_words_set``.

    Args:
        tokenized_words: Iterable of tokens to filter.
        stop_words_set: Container of tokens to discard (membership is tested
            with ``in``).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    return [token for token in tokenized_words if token not in stop_words_set]

# cleaned_by_stopwords = clean_by_stopwords(cleaned_by_freq_len, stopwords_set)

In [8]:
from nltk.stem import PorterStemmer

# Stemming helper based on the Porter stemmer
def stemming_by_porter(tokenized_words):
    """Stem every token with NLTK's ``PorterStemmer``.

    Args:
        tokenized_words: Iterable of word tokens.

    Returns:
        A list with one stem per input token, in input order.
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokenized_words]

In [10]:
# Stop-word handling
from nltk.corpus import stopwords
nltk.download('stopwords')


stopwords_set = set(stopwords.words('english')) # build the English stop-word set

# Each lambda receives the token list of ONE review, so the frequency filter
# counts occurrences within that review only. The cut-off is 1: the previous
# value of 10 contradicted both this comment and the recorded output below,
# where e.g. 'fragrance' survives with only three occurrences.
df['cleaned_tokens']=df['word_tokens'].apply(lambda x : clean_by_freq(x,1)) # drop tokens occurring at most once in the review
df['cleaned_tokens']=df['cleaned_tokens'].apply(lambda x : clean_by_len(x,2)) # drop tokens of length 2 or less
df['cleaned_tokens']=df['cleaned_tokens'].apply(lambda x : clean_by_stopwords(x,stopwords_set)) # drop stop words

df['stemmed_tokens']=df['cleaned_tokens'].apply(stemming_by_porter) # stemming -> caveat: stems are not always real words, e.g. "movie" becomes "movi" (the trailing "e" is dropped)
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Review,Star,word_tokens,cleaned_tokens,stemmed_tokens
0,i use it and it shows your skin glow but your ...,1,"[i, use, it, and, it, shows, your, skin, glow,...","[use, skin, skin, dry, used, face, face, feels...","[use, skin, skin, dri, use, face, face, feel, ..."
1,if you are someone who regularly purges when t...,1,"[if, you, are, someone, who, regularly, purges...","[new, product, product, day, day, acne, produc...","[new, product, product, day, day, acn, product..."
2,i tried so hard to like thus spray but unfortu...,1,"[i, tried, so, hard, to, like, thus, spray, bu...","[like, spray, unfortunately, like, spray, unfo...","[like, spray, unfortun, like, spray, unfortun,..."
3,this serum has artificial fragrance and theref...,1,"[this, serum, has, artificial, fragrance, and,...","[fragrance, skin, fragrance, skin, fragrance]","[fragranc, skin, fragranc, skin, fragranc]"
4,first off i want to say i don’t have sensitive...,1,"[first, off, i, want, to, say, i, don, ’, t, h...",[],[]
...,...,...,...,...,...
495,this was recommended by “ painted by spencer” ...,5,"[this, was, recommended, by, “, painted, by, s...","[love, makeup, oily, spray, spray, makeup, lov...","[love, makeup, oili, spray, spray, makeup, lov..."
496,deja un rico aroma y hidrata,5,"[deja, un, rico, aroma, y, hidrata]",[],[]
497,it’s work very well i recommend it 🥰🥰🥰🥰 great ...,5,"[it, ’, s, work, very, well, i, recommend, it,...",[],[]
498,this really melts your makeup together as sett...,5,"[this, really, melts, your, makeup, together, ...","[makeup, makeup]","[makeup, makeup]"


In [14]:
# Frequency analysis / nouns, verbs, adjectives
from nltk.tag import pos_tag
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # data for sentence and word tokenization


from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Part-of-speech tagging helper
def pos_tagger(tokenized_sents):
    """Word-tokenize and POS-tag each sentence, returning one flat list.

    Args:
        tokenized_sents: Iterable of sentence strings; each one is passed to
            ``word_tokenize`` and the resulting tokens to ``pos_tag``.

    Returns:
        A single list containing the tagged items of every sentence,
        concatenated in sentence order.
    """
    return [
        tagged_word
        for sent in tokenized_sents
        for tagged_word in pos_tag(word_tokenize(sent))
    ]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

# Map a POS tag to the WordNet POS constant for adjectives, nouns, adverbs and verbs
def penn_to_wn(tag):
    """Translate a POS tag into a WordNet part-of-speech constant.

    The decision is made on the first letter of ``tag`` only:
    ``J`` -> ``wn.ADJ``, ``N`` -> ``wn.NOUN``, ``R`` -> ``wn.ADV``,
    ``V`` -> ``wn.VERB``.

    Args:
        tag: POS tag string.

    Returns:
        The matching WordNet constant, or ``tag`` itself, unchanged, when it
        starts with none of the four letters.
    """
    if tag.startswith('J'):  # adjective
        return wn.ADJ
    if tag.startswith('N'):  # noun
        return wn.NOUN
    if tag.startswith('R'):  # adverb
        return wn.ADV
    if tag.startswith('V'):  # verb
        return wn.VERB

    # Anything else is handed back as-is so callers can tell it was not mapped.
    return tag


# Reading any `wn.*` constant loads the WordNet corpus, so it has to be
# downloaded before the check below; otherwise this cell raises LookupError
# on a fresh kernel (the next cell is the first one that downloads it).
nltk.download('wordnet')

penn_to_wn("NNG")  # sanity check: any tag starting with "N" maps to wn.NOUN ('n')

'n'

In [24]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

# Download the required NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def words_lematier(pos_tagger_words):
    """Lemmatize tagged words, leaving words with an unmapped tag untouched.

    Args:
        pos_tagger_words: Iterable of ``(word, tag)`` pairs.

    Returns:
        A list with one entry per input pair: the WordNet lemma when
        ``penn_to_wn(tag)`` yields a noun/adjective/adverb/verb constant,
        otherwise the original word.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    def _lemma_or_word(token, penn_tag):
        # Only the four WordNet parts of speech are lemmatized.
        wn_pos = penn_to_wn(penn_tag)
        if wn_pos in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            return wordnet_lemmatizer.lemmatize(token, wn_pos)
        return token

    return [_lemma_or_word(token, penn_tag) for token, penn_tag in pos_tagger_words]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Lower-case the reviews (a no-op if they were lower-cased already).
df['Review'] = df['Review'].str.lower()

# Review -> list of sentences -> flat list of tagged words.
df['sent_tokens'] = df['Review'].map(sent_tokenize)
df['pos_tagged_tokens'] = df['sent_tokens'].map(pos_tagger)

# Tagged words -> lemmas.
df['lemmatized_words'] = df['pos_tagged_tokens'].map(words_lematier)

In [26]:
# Inspect the result after the frequency, length and stop-word filters.
# Filter chain applied to each review's lemmas (innermost call runs first):
#   1. clean_by_freq(..., 10)  - drop tokens occurring 10 times or fewer in the review
#   2. clean_by_len(..., 2)    - drop tokens of length 2 or less
#   3. clean_by_stopwords(...) - drop stop words
df['cleaned_tokens'] = df['lemmatized_words'].apply(
    lambda tokens: clean_by_stopwords(
        clean_by_len(clean_by_freq(tokens, 10), 2),
        stopwords_set,
    )
)

# Join the surviving tokens of each review into one space-separated string.
df['combined_corpus'] = df['cleaned_tokens'].apply(lambda tokens: " ".join(tokens))
df

Unnamed: 0,Review,Star,word_tokens,cleaned_tokens,stemmed_tokens,sent_tokens,pos_tagged_tokens,lemmatized_words,combined_corpus
0,i use it and it shows your skin glow but your ...,1,"[i, use, it, and, it, shows, your, skin, glow,...",[],"[use, skin, skin, dri, use, face, face, feel, ...",[i use it and it shows your skin glow but your...,"[(i, NN), (use, VBP), (it, PRP), (and, CC), (i...","[i, use, it, and, it, show, your, skin, glow, ...",
1,if you are someone who regularly purges when t...,1,"[if, you, are, someone, who, regularly, purges...",[],"[new, product, product, day, day, acn, product...",[if you are someone who regularly purges when ...,"[(if, IN), (you, PRP), (are, VBP), (someone, N...","[if, you, be, someone, who, regularly, purge, ...",
2,i tried so hard to like thus spray but unfortu...,1,"[i, tried, so, hard, to, like, thus, spray, bu...",[],"[like, spray, unfortun, like, spray, unfortun,...",[i tried so hard to like thus spray but unfort...,"[(i, NN), (tried, VBD), (so, RB), (hard, JJ), ...","[i, try, so, hard, to, like, thus, spray, but,...",
3,this serum has artificial fragrance and theref...,1,"[this, serum, has, artificial, fragrance, and,...",[],"[fragranc, skin, fragranc, skin, fragranc]",[this serum has artificial fragrance and there...,"[(this, DT), (serum, NN), (has, VBZ), (artific...","[this, serum, have, artificial, fragrance, and...",
4,first off i want to say i don’t have sensitive...,1,"[first, off, i, want, to, say, i, don, ’, t, h...",[],[],[first off i want to say i don’t have sensitiv...,"[(first, RB), (off, IN), (i, NN), (want, VBP),...","[first, off, i, want, to, say, i, don, ’, t, h...",
...,...,...,...,...,...,...,...,...,...
495,this was recommended by “ painted by spencer” ...,5,"[this, was, recommended, by, “, painted, by, s...",[],"[love, makeup, oili, spray, spray, makeup, lov...",[this was recommended by “ painted by spencer”...,"[(this, DT), (was, VBD), (recommended, VBN), (...","[this, be, recommend, by, “, paint, by, spence...",
496,deja un rico aroma y hidrata,5,"[deja, un, rico, aroma, y, hidrata]",[],[],[deja un rico aroma y hidrata],"[(deja, NN), (un, JJ), (rico, NN), (aroma, NN)...","[deja, un, rico, aroma, y, hidrata]",
497,it’s work very well i recommend it 🥰🥰🥰🥰 great ...,5,"[it, ’, s, work, very, well, i, recommend, it,...",[],[],[it’s work very well i recommend it 🥰🥰🥰🥰 great...,"[(it, PRP), (’, VBZ), (s, JJ), (work, NN), (ve...","[it, ’, s, work, very, well, i, recommend, it,...",
498,this really melts your makeup together as sett...,5,"[this, really, melts, your, makeup, together, ...",[],"[makeup, makeup]",[this really melts your makeup together as set...,"[(this, DT), (really, RB), (melts, VB), (your,...","[this, really, melt, your, makeup, together, a...",


In [27]:
# Flatten the cleaned tokens of every review into one corpus-wide list.
box = []
for words in df['cleaned_tokens']:
    box.extend(words)

# Ten most frequent tokens over the whole corpus.
Counter(box).most_common(10)

[('skin', 12), ('spray', 11)]

### 정리

In [48]:
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Download every NLTK resource this cell uses, so that it does not depend on
# the download cells earlier in the notebook.
nltk.download('punkt')                       # sent_tokenize / word_tokenize
nltk.download('stopwords')                   # stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # pos_tag
nltk.download('wordnet')                     # WordNetLemmatizer

# Stop-word set
stopwords_set = set(stopwords.words('english'))

# Load the data
df = pd.read_csv('/content/drive/MyDrive/IMbank_텍스트마이닝/amazon_reviews_all.tsv', sep='\t')

# 1. Convert to lower case
df['Review'] = df['Review'].str.lower()

# 2. Sentence tokenization
df['sent_tokens'] = df['Review'].apply(sent_tokenize)  # split each review into sentences

# 3. Part-of-speech tagging
def pos_tagger(sentences):
    """Word-tokenize and POS-tag each sentence, returning one flat list.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        The ``pos_tag`` results of all sentences, concatenated in order.
    """
    return [
        tagged_token
        for sentence in sentences
        for tagged_token in pos_tag(word_tokenize(sentence))
    ]

df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)

# 4. Lemmatization
lemmatizer = WordNetLemmatizer()
def words_lemmatizer(tagged_tokens):
    """Lemmatize ``(word, tag)`` pairs using the tag's two-letter prefix.

    Args:
        tagged_tokens: Iterable of ``(word, tag)`` pairs.

    Returns:
        A list with one lemma per pair, produced by the module-level
        ``lemmatizer``. Tags beginning with ``NN``/``VB``/``JJ``/``RB`` are
        lemmatized as noun/verb/adjective/adverb; every other tag falls back
        to noun.
    """
    # Two-letter tag prefix -> POS code passed to lemmatizer.lemmatize.
    prefix_to_pos = {
        'NN': 'n',  # noun
        'VB': 'v',  # verb
        'JJ': 'a',  # adjective
        'RB': 'r',  # adverb
    }
    return [
        lemmatizer.lemmatize(word, prefix_to_pos.get(tag[:2], 'n'))  # default: noun
        for word, tag in tagged_tokens
    ]

df['lemmatized_words'] = df['pos_tagged_tokens'].apply(words_lemmatizer)

# 5. Frequency filter (drops tokens occurring `min_count` times or fewer)
def clean_by_freq(tokens, min_count):
    """Keep only the tokens that occur more than ``min_count`` times.

    Args:
        tokens: Sequence of tokens; frequencies are counted over it.
        min_count: Threshold; a token must occur strictly more often than
            this to be kept.

    Returns:
        A new list of the kept tokens in original order, duplicates included.
    """
    frequent = {
        token for token, count in Counter(tokens).items() if count > min_count
    }
    return [token for token in tokens if token in frequent]

# 6. Length filter (drops tokens of length `min_length` or less)
def clean_by_len(tokens, min_length):
    """Keep only the tokens strictly longer than ``min_length``.

    Args:
        tokens: Iterable of tokens (anything supporting ``len``).
        min_length: Threshold; tokens of this length or shorter are dropped.

    Returns:
        A new list of the kept tokens in original order.
    """
    return list(filter(lambda token: len(token) > min_length, tokens))

# 7. Stop-word filter
def clean_by_stopwords(tokens, stop_words_set):
    """Remove every token that is a member of ``stop_words_set``.

    Args:
        tokens: Iterable of tokens to filter.
        stop_words_set: Container of tokens to discard (tested with ``in``).

    Returns:
        A new list of the remaining tokens in original order.
    """
    remaining = []
    for token in tokens:
        if token in stop_words_set:
            continue
        remaining.append(token)
    return remaining

# 8. Apply the filters to each review's lemmas (innermost call runs first):
#    - clean_by_freq(..., 1): drop tokens occurring at most once in the review
#      (with a cut-off of 10, only one token each survived for the 5- and
#      4-star reviews: skin, spray)
#    - clean_by_len(..., 2): drop tokens of length 2 or less
#    - clean_by_stopwords(...): drop stop words
df['cleaned_tokens'] = df['lemmatized_words'].apply(
    lambda tokens: clean_by_stopwords(
        clean_by_len(clean_by_freq(tokens, 1), 2),
        stopwords_set,
    )
)

# 9. Join the surviving tokens of each review into one space-separated string
df['combined_corpus'] = df['cleaned_tokens'].apply(lambda tokens: " ".join(tokens))

# Show the result
df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Review,Star,sent_tokens,pos_tagged_tokens,lemmatized_words,cleaned_tokens,combined_corpus
0,i use it and it shows your skin glow but your ...,1,[i use it and it shows your skin glow but your...,"[(i, NN), (use, VBP), (it, PRP), (and, CC), (i...","[i, use, it, and, it, show, your, skin, glow, ...","[use, skin, skin, dry, use, line, face, face, ...",use skin skin dry use line face face feel hume...
1,if you are someone who regularly purges when t...,1,[if you are someone who regularly purges when ...,"[(if, IN), (you, PRP), (are, VBP), (someone, N...","[if, you, be, someone, who, regularly, purge, ...","[purge, new, product, product, purge, use, day...",purge new product product purge use day day da...
2,i tried so hard to like thus spray but unfortu...,1,[i tried so hard to like thus spray but unfort...,"[(i, NN), (tried, VBD), (so, RB), (hard, JJ), ...","[i, try, so, hard, to, like, thus, spray, but,...","[like, spray, unfortunately, break, break, lik...",like spray unfortunately break break like spra...
3,this serum has artificial fragrance and theref...,1,[this serum has artificial fragrance and there...,"[(this, DT), (serum, NN), (has, VBZ), (artific...","[this, serum, have, artificial, fragrance, and...","[fragrance, skin, fragrance, skin, fragrance]",fragrance skin fragrance skin fragrance
4,first off i want to say i don’t have sensitive...,1,[first off i want to say i don’t have sensitiv...,"[(first, RB), (off, IN), (i, NN), (want, VBP),...","[first, off, i, want, to, say, i, don, ’, t, h...",[],
...,...,...,...,...,...,...,...
495,this was recommended by “ painted by spencer” ...,5,[this was recommended by “ painted by spencer”...,"[(this, DT), (was, VBD), (recommended, VBN), (...","[this, be, recommend, by, “, paint, by, spence...","[love, makeup, oily, spray, spray, makeup, lov...",love makeup oily spray spray makeup love make ...
496,deja un rico aroma y hidrata,5,[deja un rico aroma y hidrata],"[(deja, NN), (un, JJ), (rico, NN), (aroma, NN)...","[deja, un, rico, aroma, y, hidrata]",[],
497,it’s work very well i recommend it 🥰🥰🥰🥰 great ...,5,[it’s work very well i recommend it 🥰🥰🥰🥰 great...,"[(it, PRP), (’, VBZ), (s, JJ), (work, NN), (ve...","[it, ’, s, work, very, well, i, recommend, it,...",[],
498,this really melts your makeup together as sett...,5,[this really melts your makeup together as set...,"[(this, DT), (really, RB), (melts, VB), (your,...","[this, really, melt, your, makeup, together, a...","[makeup, makeup]",makeup makeup


In [49]:
# Flatten the cleaned tokens of every review into one corpus-wide list.
box = []
for words in df['cleaned_tokens']:
    box.extend(words)

# Ten most frequent tokens over the whole corpus.
Counter(box).most_common(10)

[('skin', 176),
 ('product', 112),
 ('spray', 109),
 ('use', 97),
 ('like', 82),
 ('face', 66),
 ('smell', 54),
 ('feel', 42),
 ('look', 40),
 ('make', 37)]

In [50]:
from collections import Counter

# Return the N most frequent tokens among the reviews with a given star rating
def get_top_tokens_by_star(df, star_rating, top_n=10):
    """Count cleaned tokens over all reviews with the given star rating.

    Args:
        df: DataFrame with a ``Star`` column and a ``cleaned_tokens`` column
            holding one iterable of tokens per row.
        star_rating: Value that ``Star`` must equal for a row to be included.
        top_n: Number of entries to return (passed to ``most_common``).

    Returns:
        A list of ``(token, count)`` pairs as produced by
        ``Counter.most_common(top_n)``; empty when no row matches.
    """
    # Token lists of the rows whose rating matches.
    matching_token_lists = df.loc[df['Star'] == star_rating, 'cleaned_tokens']

    # Accumulate the counts row by row instead of flattening first.
    frequencies = Counter()
    for tokens in matching_token_lists:
        frequencies.update(tokens)

    return frequencies.most_common(top_n)

# Extract and print the top tokens for each star rating, 1 through 5
for star in range(1, 6):
    print(f"\nTop 10 tokens for {star}-star reviews:")
    top_tokens = get_top_tokens_by_star(df, star)  # top_n is left at its default
    for token, freq in top_tokens:
        print(f"{token}: {freq}")



Top 10 tokens for 1-star reviews:
product: 30
skin: 18
smell: 16
use: 11
spray: 11
look: 8
make: 6
like: 6
glow: 6
day: 5

Top 10 tokens for 2-star reviews:
skin: 30
use: 22
like: 21
product: 18
smell: 17
spray: 16
fragrance: 12
face: 12
que: 12
get: 9

Top 10 tokens for 3-star reviews:
skin: 33
product: 19
spray: 18
face: 14
n't: 14
like: 13
use: 13
feel: 11
give: 9
one: 7

Top 10 tokens for 4-star reviews:
like: 30
skin: 29
use: 28
spray: 25
face: 12
product: 11
make: 11
say: 9
smell: 9
give: 8

Top 10 tokens for 5-star reviews:
skin: 66
spray: 39
product: 34
face: 24
use: 23
look: 23
love: 19
makeup: 15
feel: 15
skincare: 14
