In [36]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os
import numpy as np
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import FloatType
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, HashingTF, IDF, Tokenizer
from pyspark.sql.functions import explode, split
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.sql.functions import udf
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [37]:
# Create the Spark session used to read the raw article files
# (getOrCreate returns the already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("text to csv")
    .getOrCreate()
)

In [38]:
# Read every file under TextFiles as a whole, one record per file.
rdd = spark.sparkContext.wholeTextFiles("TextFiles/*.txt", use_unicode=True)

# Turn the records into a DataFrame, split the file content on newlines and
# expose the first four lines as the link, heading, time and news columns.
df = (
    rdd.toDF(["filename", "text"])
    .select(split("text", "\n").alias("columns"))
    .selectExpr(
        "columns[0] as link",
        "columns[1] as heading",
        "columns[2] as time",
        "columns[3] as news",
    )
)

# Preview the resulting rows.
df.show()

+--------------------+--------------------+--------------------+--------------------+
|                link|             heading|                time|                news|
+--------------------+--------------------+--------------------+--------------------+
|https://edition.c...|‘Nothing left to ...|2023-05-07 22:34:...|The head of the W...|
|https://edition.c...|Loaf-size mission...|2023-05-07 21:06:...|A new mission des...|
|https://edition.c...|Wagner boss sugge...|2023-05-07 18:39:...|Yevgeny Prigozhin...|
|https://edition.c...|Fire kills at lea...|2023-05-07 16:32:...|A fire in a small...|
|https://edition.c...|The lady in blue,...|2023-05-07 16:29:...|The coronation of...|
|https://edition.c...|Gold mine fire in...|2023-05-07 13:14:...|A fire in a small...|
|https://edition.c...|‘A Trump tribute ...|2023-05-07 23:50:...|Late last year, a...|
|https://edition.c...|Arab League readm...|2023-05-07 12:24:...|The Arab League h...|
|https://edition.c...|UK police face ba...|2023-05-07 

In [39]:
# Collect the Spark DataFrame into pandas and discard rows that have a
# missing value in any column.
df = df.toPandas().dropna()

In [42]:
# Remember the current working directory.
# NOTE(review): `cwd` is not referenced by any later cell visible in this export.
cwd = os.getcwd()

In [46]:
# Load the labelled training articles: one file per article in labeledData.
# The listing is sorted so the row order is reproducible between runs
# (os.listdir returns entries in arbitrary order).
txt_files = sorted(os.listdir('labeledData'))
articles = []
labels = []
skipped_files = []  # (file name, exception) for every file that could not be parsed
for file in txt_files:
    path = os.path.join('labeledData', file)
    if not os.path.isfile(path):
        # Sub-directories cannot be opened as text files; they are not articles.
        continue
    try:
        with open(path, encoding='utf-8') as f:
            lines = f.read().split('\n')
        # The label is the second line without its first character;
        # the article body is the last line of the file.
        label = lines[1][1:]
        article = lines[-1]
    except (UnicodeDecodeError, IndexError) as exc:
        # Only decoding problems and files with fewer than two lines are
        # skipped; anything else (including Ctrl-C) still propagates.
        skipped_files.append((file, exc))
        continue
    articles.append(article)
    labels.append(label)

if skipped_files:
    print(f"Skipped {len(skipped_files)} undecodable or malformed file(s) in labeledData")

In [47]:
# Build the training frame directly from the two lists. Going through a 2-D
# NumPy array first would coerce both lists into a single fixed-width string
# array sized by the longest article before pandas ever sees the data.
articles_df = pd.DataFrame({'article': articles, 'label': labels})

In [48]:
# Display the labelled training frame (columns: article, label).
articles_df

Unnamed: 0,article,label
0,"Boston-based travelers, you’re in for a treat....",business
1,Nordstrom. Walmart. Whole Foods. Starbucks. CV...,business
2,An ex-ByteDance employee claimed he was wrongf...,business
3,"Pickleball is coming to your local mall, repla...",business
4,Leaders in Washington are running out of time ...,business
...,...,...
378,The world’s oldest dog is living it up in his ...,world
379,"Loonkiito, possibly one of the oldest lions in...",world
380,"“People say I’m a funny guy,” Finnish rapper K...",world
381,Germany announced on Saturday that it would su...,world


In [49]:
# Map each category name to an integer id: the position of the name in the
# list below (0 for 'business' through 7 for 'world').
label_dict = {
    category: position
    for position, category in enumerate([
        'business',
        'entertainment',
        'health',
        'politics',
        'sport',
        'style',
        'travel',
        'world',
    ])
}

In [50]:
# Replace every category name with its integer id from label_dict.
# A name that is not a key of label_dict raises KeyError.
articles_df['label'] = articles_df['label'].map(label_dict.__getitem__)

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
_ENGLISH_STOP_WORDS = None  # populated on the first preprocess() call


def _english_stop_words():
    """Return NLTK's English stop words as a set, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess(text):
    """Lower-case ``text``, tokenize it and drop English stop words.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # The stop-word set is cached: this function is applied once per article,
    # and rebuilding the set on every call is wasted work.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())
    return " ".join(token for token in tokens if token not in stop_words)

In [56]:
# Run preprocess() over every training article and keep the result in a
# separate column next to the raw text.
articles_df['article_preprocessed'] = articles_df['article'].map(preprocess)

In [57]:
# Remove carriage returns and surrounding whitespace from every column.
# The .str accessor passes missing values through as NaN instead of raising
# (which a plain x.replace(...) would do), so the dropna() that follows can
# actually discard incomplete rows. It also avoids DataFrame.applymap, which
# is deprecated in current pandas releases.
df = df.apply(
    lambda column: column.str.replace('\r', '', regex=False).str.strip()
).dropna()

In [58]:
# Training inputs are the preprocessed article texts, targets the integer labels.
X_train = articles_df['article_preprocessed']
Y_Train = articles_df['label']

# Learn the vocabulary from the training texts and encode them as a
# token-count matrix in one step.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

In [62]:
# Work on a copy so that preprocessing the `news` column later does not
# overwrite the original text held in `df`.
df_copy = df.copy()

In [63]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the token-count training matrix.
model = MultinomialNB()
model.fit(X_train_vectorized, Y_Train)

# Preprocess the articles in df_copy the same way as the training texts,
# encode them with the already-fitted vectorizer and predict a label id.
df_copy['news'] = df_copy['news'].apply(preprocess)
df_copy['category'] = model.predict(vectorizer.transform(df_copy['news']))

# Translate the predicted ids back to category names. The reverse lookup is
# built once instead of rebuilding key/value lists and searching them per row.
index_to_label = {index: name for name, index in label_dict.items()}
df['category'] = df_copy['category'].map(index_to_label)

In [64]:
# Display the articles together with the predicted `category` column.
df

Unnamed: 0,link,heading,time,news,category
0,https://edition.cnn.com/2023/05/06/europe/wagn...,‘Nothing left to grind the meat with’: Wagner ...,2023-05-07 22:34:00,The head of the Wagner private military compan...,world
1,https://edition.cnn.com/2023/05/07/world/nasa-...,Loaf-size mission launches to improve hurrican...,2023-05-07 21:06:00,A new mission designed to improve hurricane fo...,world
2,https://edition.cnn.com/2023/05/07/europe/prig...,Wagner boss suggests his forces may stay in Ba...,2023-05-07 18:39:00,"Yevgeny Prigozhin, head of the Wagner mercenar...",world
3,https://edition.cnn.com/2023/05/07/americas/pe...,Fire kills at least 27 in Peruvian gold mine,2023-05-07 16:32:00,A fire in a small gold mine in southern Peru h...,world
4,https://edition.cnn.com/2023/05/06/uk/coronati...,"The lady in blue, vanishing princes and the cr...",2023-05-07 16:29:00,The coronation of King Charles III on Saturday...,style
...,...,...,...,...,...
1094,https://edition.cnn.com/2023/05/02/sport/james...,James Harden scores 45 points to lift Joel Emb...,2023-05-02 05:57:00,Expectations were low for the Philadelphia 76e...,sport
1095,https://edition.cnn.com/2022/11/17/football/qa...,‘Our dreams never came true.’ These men helped...,"04:30 PM, Sun May 14, 2023",Kamal was standing outside a shop with other m...,sport
1096,https://edition.cnn.com/2022/05/11/sport/zhou-...,She apologized for verbally abusing an Olympic...,2022-05-12 03:45:00,"For Lisa Wright, an elite-level diving judge f...",sport
1097,https://edition.cnn.com/2022/07/25/sport/black...,Meet the Black women pushing for equality in s...,"04:30 PM, Sun May 14, 2023",Omie Dale has fond memories of splashing in th...,sport


In [67]:
# Append the categorised articles to data.csv. The header row is written only
# when the file is new or empty; otherwise every re-run would insert another
# header line in the middle of the data.
csv_path = "data.csv"
write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
df.to_csv(csv_path, index=False, mode='a', header=write_header)