In [1]:
import json

import pandas as pd
import numpy as np
import scipy.sparse as ss
import matplotlib.pyplot as plt

import corextopic.corextopic as ct
import corextopic.vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import sys
import csv
np.set_printoptions(threshold=sys.maxsize)
from scipy.stats import entropy

In [2]:
# load the data from file
def load_csv(filename):
    """Load a review CSV and extract the fields used by the rest of the notebook.

    Columns are addressed by position: 1 = customer id, 2 = review id,
    7 = rating, 8 and 9 = vote counts, 13 = review text.
    NOTE(review): this looks like the Amazon Customer Reviews column
    layout -- confirm against the input file.

    Args:
        filename: Path of the comma-separated, UTF-8 encoded file.

    Returns:
        Five parallel lists with one entry per row, in this order:
        ``res`` (lower-cased review text), ``votes`` (tuples of
        ``(column 8, column 9 - column 8)`` as ints), ``actual_rating``
        (ints), ``customer_id`` and ``review_id`` (raw cell values).

    Raises:
        ValueError: if a rating or vote cell cannot be converted to ``int``
            (for example a missing value).
    """
    data = pd.read_csv(filename, encoding='utf-8')
    res = []
    votes = []
    actual_rating = []
    customer_id = []
    review_id = []
    for line in data.values:
        # NOTE(review): a missing review body (NaN) becomes the literal
        # string "nan" here because of the str() conversion.
        res.append(str(line[13]).lower())
        # Convert each count on its own before subtracting. The original
        # `int(line[9] - int(line[8]))` subtracted first, which only works
        # when column 9 is already numeric.
        first_count = int(line[8])
        votes.append((first_count, int(line[9]) - first_count))
        actual_rating.append(int(line[7]))
        customer_id.append(line[1])
        review_id.append(line[2])
    return res, votes, actual_rating, customer_id, review_id


# data, votes, actual_rating, customer_id, review_id = load_csv("electronics_votes10more_reviews.csv")
# print(len(data))

121649


In [4]:
# Read the tab-separated Electronics review file into a DataFrame;
# lines that pandas cannot parse are skipped.
data = pd.read_csv(
    "amazon_reviews_us_Electronics_v1_00.tsv",
    sep='\t',
    on_bad_lines='skip',
)

In [3]:
# load the data from file
def load_tsv(filename):
    """Load a tab-separated review file and extract the fields used downstream.

    Columns are addressed by position: 1 = customer id, 2 = review id,
    7 = rating, 8 and 9 = vote counts, 13 = review text. Lines that pandas
    cannot parse are skipped (``on_bad_lines='skip'``).
    NOTE(review): this looks like the Amazon Customer Reviews column
    layout -- confirm against the input file.

    Args:
        filename: Path of the tab-separated, UTF-8 encoded file.

    Returns:
        Five parallel lists with one entry per row, in this order:
        ``res`` (lower-cased review text), ``votes`` (tuples of
        ``(column 8, column 9 - column 8)`` as ints), ``actual_rating``
        (ints), ``customer_id`` and ``review_id`` (raw cell values).

    Raises:
        ValueError: if a rating or vote cell cannot be converted to ``int``
            (for example a missing value).
    """
    data = pd.read_csv(filename, encoding='utf-8', sep='\t', on_bad_lines='skip')
    res = []
    votes = []
    actual_rating = []
    customer_id = []
    review_id = []
    for line in data.values:
        # NOTE(review): a missing review body (NaN) becomes the literal
        # string "nan" here because of the str() conversion.
        res.append(str(line[13]).lower())
        # Convert each count on its own before subtracting. The original
        # `int(line[9] - int(line[8]))` subtracted first, which only works
        # when column 9 is already numeric.
        first_count = int(line[8])
        votes.append((first_count, int(line[9]) - first_count))
        actual_rating.append(int(line[7]))
        customer_id.append(line[1])
        review_id.append(line[2])
    return res, votes, actual_rating, customer_id, review_id

# Run the loader on the Electronics file and report how many reviews were parsed.
(data, votes, actual_rating, customer_id, review_id) = load_tsv(
    "amazon_reviews_us_Electronics_v1_00.tsv"
)
print(len(votes))

3091024


In [None]:
# Lift the column limit so displayed frames show every column.
pd.options.display.max_columns = None
data = pd.read_stata("yelp.dta")
# `head` has to be called: printing the bare attribute shows the
# bound-method repr rather than the first rows of the frame.
print(data.head())

In [None]:
def load_json(filename):
    """Return the ``"review"`` value of every record in a JSON file.

    Args:
        filename: Path of a JSON file whose top-level value is an iterable
            of objects that each have a ``"review"`` key.

    Returns:
        A list with one ``"review"`` value per record, in file order.

    Raises:
        KeyError: if a record has no ``"review"`` key.
    """
    # The context manager closes the handle even when parsing fails;
    # the original left the file open.
    with open(filename) as file:
        data = json.load(file)
    return [i["review"] for i in data]

In [4]:
# Reload the review text together with the per-review metadata lists.
(data, votes, actual_rating, customer_id, review_id) = load_tsv(
    "amazon_reviews_us_Electronics_v1_00.tsv"
)

# Binary bag-of-words: English stop words removed, at most 20000 features.
vectorizer = CountVectorizer(binary=True, max_features=20000, stop_words='english')
doc_word = ss.csr_matrix(vectorizer.fit_transform(data))
print(doc_word.shape)


num_topic = 10

# Vocabulary in the same order as the matrix columns.
words = list(np.asarray(vectorizer.get_feature_names_out()))

# Drop the tokens made up only of digits, from both the matrix and the vocabulary.
not_digit_inds = [col for col, token in enumerate(words) if not token.isdigit()]
doc_word = doc_word[:, not_digit_inds]
words = [words[col] for col in not_digit_inds]

print(doc_word.shape)


(3091024, 20000)
(3091024, 19518)


In [10]:
# load keywords
anchor_words = []
fh = open("Initial Keywords Electronics 2.txt")
anchor = fh.readlines()
for i in range(len(anchor)):
    if i % 3 == 1:
        anchor_words.append(anchor[i].rstrip("\n").split(" "))

len(anchor_words)

10

In [13]:


# Build a CorEx model with `num_topic` hidden units and fit it on the
# document-word matrix, using the anchor word lists.
topic_model = ct.Corex(
    n_hidden=num_topic,
    words=words,
    max_iter=2000,
    verbose=False,
    seed=1,
)
topic_model.fit(
    doc_word,
    words=words,
    anchors=anchor_words,
    anchor_strength=10,
)

# Print the words of every topic, one topic per line.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words, _, _ = zip(*topic)
    print(f"{n}: " + ", ".join(topic_words))



0: bought, purchased, pair, year, ordered, cost, month, paid, owned, lasted
1: br, just, like, don, use, music, ve, want, way, using
2: tv, cable, hdmi, receiver, connect, setup, connected, input, output, hooked
3: listen, wife, son, daughter, husband, sitting, bed, morning, walking, walk
4: time, battery, turn, second, button, setting, alarm, mode, press, turning
5: sound, bass, clarity, treble, mids, distortion, headphones, muddy, midrange, reproduction
6: plastic, rubber, slip, headband, housing, bend, earpiece, silicone, strap, bent
7: quality, good, price, really, better, overall, nice, looking, think, look
8: product, company, customer, seller, manufacturer, contacted, sorry, email, exchange, vendor
9: mp3, software, download, itunes, file, format, library, downloaded, podcasts, content


In [14]:
# Topic probabilities per review; only element 0 of what predict_proba
# returns is used.
# NOTE(review): presumably one row per review and one column per topic --
# confirm against the corextopic documentation.
prob = topic_model.predict_proba(doc_word)[0]

# The rows below are built by zipping parallel sequences, which would
# silently truncate on a length mismatch, so check the alignment first.
assert len(prob) == len(data), "prob and data must describe the same reviews"

L = [["customer_id", "review_id", "actual_rating", "upvote", "down_vote", "original_content", "prob_a1", "prob_a2", "prob_a3", "prob_a4", "prob_a5", "prob_a6", "prob_a7", "prob_a8", "prob_a9", "prob_a10", "entropy", "length"]]
# Iterate over the loaded reviews themselves instead of a hard-coded row
# count (3091024), so the cell still works when the input changes.
for cust, rev, rating, (upvote, down_vote), text, topic_probs in zip(
        customer_id, review_id, actual_rating, votes, data, prob):
    row = [cust, rev, rating, upvote, down_vote, text]
    row.extend(topic_probs)
    # Entropy (in bits) of this review's topic probabilities.
    row.append(entropy(topic_probs, base=2))
    # Review length in whitespace-separated tokens.
    row.append(len(text.split()))
    L.append(row)

# 'w' instead of 'a+': re-running the cell must not append a second header
# and a duplicate copy of every row. newline='' is what the csv module
# requires so that row terminators are not translated a second time.
with open('electronics_review_seed2.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(L)

In [15]:
# Read the exported table back in (lines pandas cannot parse are skipped)
# and show its dimensions.
data = pd.read_csv(
    "electronics_review_seed2.csv",
    on_bad_lines='skip',
)
data.shape

(3091024, 18)

In [8]:
# Display the dimensions of `data`.
data.shape

(3091024, 18)

In [None]:
data = load_json("yelp.json")
# vectorize the words in the articles into binary
vectorizer = CountVectorizer(stop_words='english', max_features=20000, binary=True)
doc_word = vectorizer.fit_transform(data)
doc_word = ss.csr_matrix(doc_word)

num_topic = 9
# get all the words
words = list(np.asarray(vectorizer.get_feature_names_out()))

# get rid of all the digits
not_digit_inds = [ind for ind, word in enumerate(words) if not word.isdigit()]
doc_word = doc_word[:, not_digit_inds]
words = [words[ind] for ind in not_digit_inds]

# Fit a CorEx model with `num_topic` (9) topics; no anchor words are passed here.
topic_model = ct.Corex(n_hidden=num_topic, words=words, max_iter=2000, verbose=False, seed=1)
topic_model.fit(doc_word, words=words)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words, _, _ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))

# Topic probabilities per review; only element 0 of what predict_proba
# returns is used.
prob = topic_model.predict_proba(doc_word)[0]
# NOTE(review): the import cell sets np.set_printoptions(threshold=sys.maxsize),
# so this prints the array without summarization.
print(prob)

# The rows below are built by zipping parallel sequences, which would
# silently truncate on a length mismatch, so check the alignment first.
assert len(prob) == len(data), "prob and data must describe the same reviews"

L = [["Review", "prob_a1", "prob_a2", "prob_a3", "prob_a4", "prob_a5", "prob_a6", "prob_a7", "prob_a8", "prob_a9", "entropy", "length"]]
# Iterate over the loaded reviews themselves instead of a hard-coded row
# count (939580), so the cell still works when the input changes.
for text, topic_probs in zip(data, prob):
    row = [text]
    row.extend(topic_probs)
    # Entropy (in bits) of this review's topic probabilities.
    row.append(entropy(topic_probs, base=2))
    # Review length in whitespace-separated tokens.
    row.append(len(text.split()))
    L.append(row)

# 'w' instead of 'a+': re-running the cell must not append a second header
# and a duplicate copy of every row. newline='' is what the csv module
# requires so that row terminators are not translated a second time.
with open('yelp.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(L)



In [19]:
# US dollars obtained for one RMB.
exchange_rate = 0.1456

# The amount may be typed with a one-character unit suffix (e.g. "1234.5元").
# Drop the last character only when it is not a digit; the original removed
# it unconditionally, so a bare "1234.5" was silently read as 1234.0.
raw_amount = input("请输入人民币金额：")
if raw_amount and not raw_amount[-1].isdigit():
    raw_amount = raw_amount[:-1]
rmb_amount = float(raw_amount)
usd_amount = rmb_amount * exchange_rate

print("{}元人民币可以兑换{:.2f}美元".format(rmb_amount, usd_amount))


1234.5元人民币可以兑换179.74美元


In [6]:
a = int(input("输入a"))
b = int(input("输入b"))

# Collect every palindromic number in [a, b] as a string.
res = [str(number) for number in range(a, b + 1) if str(number) == str(number)[::-1]]

# Print them separated by spaces, starting a new line after every fifth one.
count = 0
for text in res:
    print(text, end=' ')
    count += 1
    if count % 5 == 0:
        print()
        
    

101 111 121 131 141 
151 161 171 181 191 


In [15]:
def check_password_strength(password):
    """Score a password from 0 to 5 by counting the criteria it satisfies.

    One point is given for each of:
      * contains an ASCII uppercase letter (A-Z),
      * contains an ASCII lowercase letter (a-z),
      * contains a digit (0-9),
      * contains one of the symbols ``.,/!;:?<>``,
      * is at least 8 characters long.

    Args:
        password: The password string to evaluate.

    Returns:
        The number of satisfied criteria, an int between 0 and 5.
    """
    # Range comparisons cover the whole alphabet; the original spelled the
    # uppercase letters out by hand and left out "H".
    contain_big_letter = any("A" <= ch <= "Z" for ch in password)
    contain_small_letter = any("a" <= ch <= "z" for ch in password)
    contain_number = any("0" <= ch <= "9" for ch in password)
    contain_symbol = any(ch in '.,/!;:?<>' for ch in password)
    # Despite the name, a length of exactly 8 also counts.
    longer_than_8 = len(password) >= 8

    res = [contain_big_letter, contain_small_letter, contain_number, contain_symbol, longer_than_8]

    return res.count(True)

In [16]:
# Example: "P123" satisfies two criteria (an uppercase letter and a digit).
check_password_strength("P123")

2

In [18]:
def caesar_cipher(offset):
    """Build the substitution table of a Caesar cipher.

    Args:
        offset: Number of positions to shift; it is reduced modulo 26, so
            any integer (including a negative one) is accepted.

    Returns:
        A dict that maps each of the 52 ASCII letters to the letter
        ``offset`` positions later in its own alphabet, wrapping around
        after ``Z`` / ``z``. Case is preserved.
    """
    shift = offset % 26  # keep the shift within 0..25

    mapping = {}
    for position in range(26):
        # The letter at `position` is replaced by the one `shift` places
        # further along, wrapping around at the end of the alphabet.
        target = (position + shift) % 26
        mapping[chr(ord('A') + position)] = chr(ord('A') + target)
        mapping[chr(ord('a') + position)] = chr(ord('a') + target)

    return mapping


def encrypt():
    """Prompt for a shift and a plaintext, and return the Caesar ciphertext.

    The plaintext is lower-cased before it is encrypted. Characters that
    are not in the cipher table (spaces, digits, punctuation, ...) are
    copied through unchanged; the original raised ``KeyError`` on them.

    Returns:
        The encrypted text as a string.

    Raises:
        ValueError: if the shift that was typed in is not an integer.
    """
    offset = int(input("请输入偏移数目: "))
    sentence = input("输入明文: ").lower()
    mapping = caesar_cipher(offset)
    # .get(letter, letter) leaves characters without a mapping as they are.
    return "".join(mapping.get(letter, letter) for letter in sentence)

# Run the interactive prompt; the returned ciphertext is the cell's output.
encrypt()


'vxqgdb'