### output_en.csvの内容のcontentを要約して新たなcsvファイルであるsumy.csvを出力する

In [42]:
import pandas as pd
import nltk
import spacy


# フィードデータの読み込み
df = pd.read_csv('data/output_en.csv')
# 確認
df


Unnamed: 0,url,title,link,summary,content
0,https://news.ycombinator.com/rss,I'm Shadow Banned by DuckDuckGo (and Bing),https://daverupert.com/2023/01/shadow-banned-b...,Comments,\n\n\n\n\nI'm Shadow Banned by DuckDuckGo (and...
1,https://news.ycombinator.com/rss,SLT – A Common Lisp Language Plugin for Jetbra...,https://github.com/Enerccio/SLT,Comments,\n\n\n\n\n\n\n\n\nEnerccio\n\n/\n\nSLT\n\nPubl...
2,https://news.ycombinator.com/rss,Ask HN: How do you trust that your personal ma...,https://news.ycombinator.com/item?id=34388866,Comments,\n\nAsk HN: How do you trust that your persona...
3,https://news.ycombinator.com/rss,Ubuntu 22.04 LTS servers and phased apt updates,https://utcc.utoronto.ca/~cks/space/blog/linux...,Comments,\n \n Chris's Wiki :: blog/linux/Ubuntu2204Ser...
4,https://news.ycombinator.com/rss,Single-file scripts that download their depend...,https://dbohdan.com/scripts-with-dependencies,Comments,\n\n\n\nSingle-file scripts that download thei...
...,...,...,...,...,...
1740,https://news.ycombinator.com/rss,Tracy: A hybrid frame and sampling profiler fo...,https://github.com/wolfpld/tracy,Comments,\n\n\n\n\n\n\n\n\nwolfpld\n\n/\n\ntracy\n\nPub...
1741,https://news.ycombinator.com/rss,Naval Architecture (2021),https://ciechanow.ski/naval-architecture/,Comments,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1742,https://news.ycombinator.com/rss,"Apple sued for promising privacy, failing at it",https://www.theregister.com/2023/01/28/apple_s...,Comments,403 forbidden. The Register apologises but yo...
1743,https://news.ycombinator.com/rss,Ask HN: What is the best source to learn Docke...,https://news.ycombinator.com/item?id=34563353,Comments,\n\nAsk HN: What is the best source to learn D...


In [43]:
import re
from nltk.corpus import wordnet

symbols_to_remove = r'["`,.' + r"'" + r']'
stop_words = nltk.corpus.stopwords.words('english')
stop_words += ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '`', '•', '%']
stop_words += ['–', '—', '‘', '’', '“', '”', '…', '|', '#', '$', '&', "''", '(', ')']
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# 品詞の名称を変換
def wordnet_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    return None

def preprocess(text):
    tokens = []
    # 品詞のタグ付けをした各トークンについて
    for t in nltk.pos_tag(nltk.tokenize.word_tokenize(text.replace('-', ' '))):
        # 小文字化
        t0 = t[0].lower()
        # 不要な文字の削除
        t0 = re.sub(symbols_to_remove, '', t0)
        # 空文字列になったら次へ
        if t0 == '':
            continue
        # stop_words に含まれていないトークンのみを残す
        if t0 in stop_words:
            continue
        # カンマ区切りが入った数値からカンマを削除
        if t[1] == 'CD':
            t0 = t0.replace(',', '')
        # 見出し語化
        tag = wordnet_tag(t[1])
        if tag is None:
            t0 = lemmatizer.lemmatize(t0)
        else:
            t0 = lemmatizer.lemmatize(t0, tag)
        # ステミング
        t0 = stemmer.stem(t0)
        # リストに追加
        tokens.append(t0)
    # トークンのリストを返す
    return tokens

In [44]:
import spacy
import re
import unicodedata

class EnglishCorpus:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
    def preprocessing(self, text):
        text = re.sub(r'\n', '', text)
        text = re.sub(r'\r', '', text)
        return text
    
    def make_sentence_list(self, sentences):
        doc = self.nlp(sentences)
        self.ginza_sents_object = doc.sents
        sentence_list = [s for s in doc.sents]
        return sentence_list
    def make_corpus(self):
        corpus = []
        for s in self.ginza_sents_object:
            tokens = [str(t) for t in s]
            corpus.append(' '.join(tokens))
        return corpus

In [45]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.utils import get_stop_words
# algorithms
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
algorithm_dic = {"lex": LexRankSummarizer(), "tex": TextRankSummarizer(), "lsa": LsaSummarizer(),\
                 "kl": KLSummarizer(), "luhn": LuhnSummarizer(), "redu": ReductionSummarizer(),\
                 "sum": SumBasicSummarizer()}

def summarize_sentences(sentences, sentences_count=3, algorithm="lex", language="english"):
    corpus_maker = EnglishCorpus()
    preprocessed_sentences = corpus_maker.preprocessing(sentences)
    preprocessed_sentence_list = corpus_maker.make_sentence_list(preprocessed_sentences)
    corpus = corpus_maker.make_corpus()
    parser = PlaintextParser.from_string(" ".join(corpus), Tokenizer(language))
    try:
        summarizer = algorithm_dic[algorithm]
    except KeyError:
        print("algorithm name:'{}'is not found.".format(algorithm))
    summarizer.stop_words = get_stop_words(language)
    summary = summarizer(document=parser.document, sentences_count=sentences_count)
    return " ".join([sentence.__str__() for sentence in summary])

In [46]:
def generate_sumy_result(content):
    sentences_count = 3
    algorithm = "lex"
    language="english"
    sum_sentences = summarize_sentences(content, sentences_count=sentences_count, algorithm=algorithm, language=language)
    print(sum_sentences)
    return sum_sentences


In [47]:

# 各行に対して要約を生成
df['sumy_result'] = df.apply(lambda row: generate_sumy_result(row['content']), axis=1)
# 出力
target_df = df.drop(['url', 'summary', 'content'], axis=1)
target_df.to_csv('./sumy.csv', mode='w', header=True)

I 'm Shadow Banned by DuckDuckGo ( and Bing)January 14 , 2023It came to my attention that my site does not appear on DuckDuckGo search results . After some digging , DuckDuckGo used to get their site index from Yandex , but now gets their site index from Bing and sure enough … I did n’t appear on Bing either . To solve this , I took the first step and signed up for Bing Webmaster Tools to try to know what Bing knows about my site and sure enough : zero clicks , zero impressions , and zero indexed pages for my site .
Enerccio / SLTPublic Notifications Fork     0            Star 26           SLT is an IDE Plugin for Itellij / Jetbrains IDE lineup implementing support for Common Lisp via SBCL and Slime / Swank        License      Apache-2.0 license     26           stars 0           forks             Star    NotificationsCodeIssues4Pull requests0ActionsProjects0SecurityInsights   More                    Code                    Issues                    Pull requests                    Act