In [17]:
# These are the libraries that we will use in this task
# You can install them using the following command:
# pip install clean-text
# pip install pandas
import re
import pandas as pd
from cleantext import clean 
import cleantext

data = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
df = pd.read_csv(data) # Read the data into a DataFrame

# A subset of the DataFrame containing the rows labelled 100 through 200.
# .loc slices by label and includes both endpoints. The .copy() makes the
# subset an independent frame, so assigning to its columns later never writes
# through to (or warns about) the parent frame `df`.
subset_df = df.loc[100:200].copy()

# Regular expressions to match different types of URLs
url_pattern = re.compile(r'https?://\S+|www\.\S+|\S+\.com')
# Regular expressions to match the date format.
# Alternatives are tried left to right, so the longest timestamp form is listed first.
# The separator before the microseconds is an escaped literal dot: an unescaped '.'
# would accept any character there.
date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6})|'           # YYYY-MM-DD HH:MM:SS.MMMMMM
                        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})|'                      # YYYY-MM-DD HH:MM:SS
                        r'(\d{4}-\d{2}-\d{2})|'                                        # YYYY-MM-DD
                        r'(\d{4}\.\d{2}\.\d{2})|'                                      # YYYY.MM.DD
                        r'(\d{2}\.\d{2}\.\d{4})|'                                      # DD.MM.YYYY
                        r'(\d{4}/\d{2}/\d{2})|'                                        # YYYY/MM/DD
                        r'(\d{2}/\d{2}/\d{4})|'                                        # DD/MM/YYYY
                        r'((january|february|march|april|may|june|july|august|'        # <Month> DD, YYYY (all twelve months)
                        r'september|october|november|december) \d{2}, \d{4})', re.IGNORECASE)    # Ignore capitalization
# Regular expression to match numbers: digits with optional ",ddd" thousands
# groups and an optional decimal part.
number_pattern = re.compile(r'(\d+(?:,\d{3})*(?:\.\d+)?)')


def cleaner_text(df):
    """Normalise every object-dtype column of ``df`` and return the frame.

    For each such column the values go through these steps, in order:
    whitespace runs are collapsed to a single space, then matches of
    ``url_pattern``, ``date_pattern`` and ``number_pattern`` are replaced by
    ``<URL>``, ``<DATE>`` and ``<NUM>``, and finally each value is passed to
    ``clean`` (imported from ``cleantext``).

    The cleaned columns are assigned back onto the frame that was passed in,
    and that same frame is returned.
    """
    # Columns with object dtype (usually strings)
    text_columns = df.select_dtypes(include=["object"]).columns

    for name in text_columns:
        # The order matters: dates are replaced before the generic number
        # pattern gets a chance to consume their digits.
        collapsed = df[name].replace(r'\s+', ' ', regex=True)
        without_urls = collapsed.replace(url_pattern, '<URL>', regex=True)
        without_dates = without_urls.replace(date_pattern, '<DATE>', regex=True)
        without_numbers = without_dates.replace(number_pattern, '<NUM>', regex=True)
        df[name] = without_numbers.apply(clean)

    return df

# Show the column labels of the subset as this cell's output
subset_df.columns
#subset_df

#subset_df.drop(['id',''])



#subset_df.drop('')


Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary'],
      dtype='object')

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
import timeit
ps = PorterStemmer()

preData = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'
# Read from preData, the URL defined in this cell, rather than from a `data`
# variable that only exists if another cell ran first.
preDf = pd.read_csv(preData) # Read the unprocessed data into a DataFrame
# cleaner_text assigns the cleaned columns back onto the frame it receives.
# Passing a copy keeps preDf as the untouched baseline and makes re-running
# this cell give the same result every time.
postDf = cleaner_text(preDf.copy()) # Read the processed data into a DataFrame

def word_info(df, opt):
    """Summarise the whitespace-separated words found across the columns of ``df``.

    All columns are stacked, in column order, into one Series. Each entry is
    split on whitespace and the resulting lists are flattened to one word
    per row.

    Parameters
    ----------
    df : DataFrame whose columns are concatenated.
    opt : 'ints' for the number of distinct words, 'words' for the flattened
        Series of words itself.

    Returns
    -------
    The distinct-word count for 'ints', the Series of words for 'words',
    and None for any other value of ``opt``.
    """
    stacked = pd.concat([df[name] for name in df.columns], ignore_index=True)

    # Unknown options fall through without touching the text at all
    if opt not in ('ints', 'words'):
        return None

    tokens = stacked.str.split().explode()  # one word per row
    return tokens.nunique() if opt == 'ints' else tokens

# Relative frequency of each distinct word
def wordSignificance(word):
    """Return each distinct value's share of the entries in ``word``.

    ``word`` is a Series of words. The result is indexed by the distinct
    words, ordered as ``value_counts`` orders them, and holds each word's
    occurrence count divided by ``len(word)``.
    """
    total_entries = len(word)
    occurrences = word.value_counts()
    return occurrences.div(total_entries)

# Most frequent words: candidates for stopwords at the top of the distribution
def topStopwords(wordSignificance, n):
    """List the words whose significance is strictly greater than ``n``.

    ``wordSignificance`` is a Series of per-word ratios indexed by word. The
    parameter shares its name with the module-level function and hides it
    inside this body. ``n`` is compared directly against those ratios, so it
    is a fraction (0.005 means 0.5%), not a percentage.
    """
    above_threshold = wordSignificance.gt(n)
    frequent = wordSignificance.loc[above_threshold]
    return frequent.index.tolist()
# Words whose share of all words in the cleaned data is above 0.005
topstopword_list = topStopwords(
    wordSignificance(word_info(postDf, 'words')),
    0.005,
)

# Rarest words: candidates for removal at the bottom of the distribution
def botStopwords(wordSignificance, n):
    """List the words whose significance is strictly less than ``n``.

    ``wordSignificance`` is a Series of per-word ratios indexed by word. The
    parameter shares its name with the module-level function and hides it
    inside this body. ``n`` is compared directly against those ratios, so it
    is a fraction, not a percentage.
    """
    below_threshold = wordSignificance.lt(n)
    rare = wordSignificance.loc[below_threshold]
    return rare.index.tolist()
# Words whose share of all words in the cleaned data is below 0.00001
botstopword_list = botStopwords(
    wordSignificance(word_info(postDf, 'words')),
    0.00001,
)


# A function to remove stopwords from our dataframe
def removeStopwords(stopwords, df_WIP):
    """Return a copy of ``df_WIP`` with the stopwords removed from its text columns.

    Parameters
    ----------
    stopwords : iterable of words to drop (exact, whole-word matches).
    df_WIP : DataFrame to process. It is not modified, so the caller's frame
        and the returned frame stay distinct objects and can be compared.

    Returns
    -------
    A new DataFrame in which every string cell of each object-dtype column
    has been split on whitespace, stripped of the stopwords and re-joined
    with single spaces. Cells that are not strings (e.g. missing values)
    and columns of other dtypes are returned unchanged.
    """
    # Membership is tested once per word of every cell; a set makes each
    # test a hash lookup instead of a scan through the whole list.
    blocked = set(stopwords)

    def strip_stopwords(text):
        # Leave non-string cells alone instead of failing on .split()
        if not isinstance(text, str):
            return text
        return ' '.join(token for token in text.split() if token not in blocked)

    result = df_WIP.copy()  # work on a copy so the input frame is left intact
    for col in result.columns:
        if result[col].dtype == 'O':  # object dtype, i.e. the text columns
            result[col] = result[col].apply(strip_stopwords)
    return result
# Strip both the most frequent and the rarest words from the cleaned data
nostopword_df = removeStopwords(
    topstopword_list + botstopword_list,
    postDf,
)


def stem_word(word):
    """Stem every whitespace-separated token of ``word`` with the stemmer ``ps``.

    The stemmed tokens are joined back together with single spaces.
    """
    tokens = word.split()
    return ' '.join(ps.stem(token) for token in tokens)

def df_stemmer(df_WIP):
    """Return a copy of ``df_WIP`` with every text cell stemmed by ``stem_word``.

    Only object-dtype columns are processed; other columns are carried over
    unchanged. The input frame is not modified, so it can still be compared
    with the stemmed result afterwards.
    """
    result = df_WIP.copy()  # keep the caller's frame intact

    def stem_cell(value):
        # Non-string cells (e.g. missing values) have no .split(); pass them through
        return stem_word(value) if isinstance(value, str) else value

    # Only select 'object' dtype columns
    object_columns = result.select_dtypes(include=['object']).columns

    for col in object_columns:
        # Element-wise Series.map replaces DataFrame.applymap, which is
        # deprecated in recent pandas releases.
        result[col] = result[col].map(stem_cell)

    return result

# Stem the words that remain after stopword removal
stemmed_df = df_stemmer(df_WIP=nostopword_df)



# Disabled plotting/reporting code, kept as a bare triple-quoted string literal.
# Nothing inside it is executed; because the string is the last expression of
# the cell, the notebook echoes its text as the cell's output.
"""
# A function to plot the top 50 most frequent words
def plot(words, title, colors, max_words):
    # count the frequency of each word
    word_counts = words.value_counts()

    # sort the list of words by frequency
    word_counts = word_counts.sort_values(ascending=False)

    # plot a bar chart of the 50 most frequent words
    top_words = word_counts[:max_words] # get the top 50 words
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(top_words.index, top_words.values, color=colors, width=0.8, align='edge') 
    ax.set_xticks(range(len(top_words))) # set the x-ticks to the word positions
    ax.set_xticklabels(top_words.index, rotation=90)
    ax.set_xlabel('Word/Tokens')
    ax.set_ylabel('Frequency of Occurrence')
    ax.set_xlim(-0.5, len(top_words) - 0.5) # add a gap at the beginning and end
    plt.title(title)
    plt.show()



### DF info ###
print('\n### Unique words in the data before and after preprocessing ###\n')
print('Unique words before preprocessing: ', word_info(preDf, 'ints')) # 31808
print('Unique words after preprocessing: ', word_info(postDf, 'ints')) # 26628
print('Unique words after removing stopwords: ', word_info(nostopword_df, 'ints')) 
print('')
print('Reduction in size of vocabulary: ', (word_info(postDf, 'ints')) - (word_info(nostopword_df, 'ints')))
print('Reduction %: ', round(((word_info(postDf, 'ints')) - (word_info(nostopword_df, 'ints')))/word_info(postDf, 'ints')*100, 5), '%')
print('')
print('Reduction in size of vocabulary after stemming: ', (word_info(postDf, 'ints')) - (word_info(stemmed_df, 'ints')))
print('Reduction %: ', round(((word_info(postDf, 'ints')) - (word_info(stemmed_df, 'ints')))/word_info(postDf, 'ints')*100, 5), '%')
print('')
print('Unique words after stemming & removing stopwords: ', word_info(stemmed_df, 'ints'))

### Plots ###
plot(word_info(preDf, 'words'), 'Top 50 most frequent words in the data before preprocessing', 'red', 50)
plot(word_info(postDf, 'words'), 'Top 50 most frequent words in the data after preprocessing', 'green', 50)
plot(word_info(nostopword_df, 'words'), 'Top 50 most frequent words in the data after removing stopwords', 'blue', 50)

"""

"\n# A function to plot the top 50 most frequent words\ndef plot(words, title, colors, max_words):\n    # count the frequency of each word\n    word_counts = words.value_counts()\n\n    # sort the list of words by frequency\n    word_counts = word_counts.sort_values(ascending=False)\n\n    # plot a bar chart of the 50 most frequent words\n    top_words = word_counts[:max_words] # get the top 50 words\n    fig, ax = plt.subplots(figsize=(12, 6))\n    ax.bar(top_words.index, top_words.values, color=colors, width=0.8, align='edge') \n    ax.set_xticks(range(len(top_words))) # set the x-ticks to the word positions\n    ax.set_xticklabels(top_words.index, rotation=90)\n    ax.set_xlabel('Word/Tokens')\n    ax.set_ylabel('Frequency of Occurrence')\n    ax.set_xlim(-0.5, len(top_words) - 0.5) # add a gap at the beginning and end\n    plt.title(title)\n    plt.show()\n\n\n\n### DF info ###\nprint('\n### Unique words in the data before and after preprocessing ###\n')\nprint('Unique words before

In [104]:
#print(stemmer(nostopword_df))

In [103]:
#stemmed_df

In [10]:
input_file = '/volumes/Glyph1TB/newsCorpus/news_cleaned_2018_02_13.csv'
output_file = '/volumes/Glyph1TB/newsCorpus/news_cleaned_2018_02_13-results.csv'

#input_file = '/volumes/Glyph1TB/newsCorpus/Wiki_news.csv'
#output_file = '/volumes/Glyph1TB/newsCorpus/Wiki_news_cleaned.csv'

chunksize = 1000  # number of rows read, processed and written per iteration

# The combined stopword list is the same for every chunk, so build it once.
all_stopwords = topstopword_list + botstopword_list

with pd.read_csv(input_file, chunksize=chunksize, encoding='utf-8', lineterminator='\n') as reader:
    # Open with "w", not "a": every run starts reading the input from its first
    # row and writes a header, so appending to an earlier (possibly interrupted)
    # result file would duplicate both the header and the rows.
    # The text encoding is set on the handle because that is what governs the
    # bytes written; newline='' stops Python from translating the line endings
    # that to_csv emits.
    with open(output_file, "w", encoding='utf-8', newline='') as out:
        first_chunk = True
        for chunk in reader:
            cleaned_chunk = cleaner_text(chunk.copy())
            no_stopwords_chunk = removeStopwords(all_stopwords, cleaned_chunk)
            stemmed_chunk = df_stemmer(no_stopwords_chunk)

            # Write the processed chunk; only the first one carries the header row
            stemmed_chunk.to_csv(out, header=first_chunk, index=False)
            first_chunk = False

KeyboardInterrupt: 