In [None]:
import pandas as pd
import preprocessor as p
import regex as re
import torch
import torchtext
import torchdata
import portalocker
from wordsegment import load, segment
# NOTE(review): per the wordsegment documentation, load() has to be called once
# before segment() can be used (segment() is called by hash_fix) - confirm.
load()

In [None]:
def hash_fix(h):
    """
    Turns one hashtag into a string of space-separated words.

    Every digit and every '#' character is removed from `h`, the remaining
    text is split into words with wordsegment's `segment`, and the words
    are joined with single spaces.
    (Presumably '#climatechange2020' ends up as 'climate change'; the exact
    split is decided by wordsegment.)
    """
    # Drop runs of digits first, then the '#' marker(s).
    without_digits = re.sub(r'[0-9]+', '', h)
    bare_tag = without_digits.replace('#', '')
    # segment() returns the individual words found in the concatenated text.
    words = segment(str(bare_tag))
    return ' '.join(str(word) for word in words)

def hash_dict(df, hash_col):
    """
    Creates a hashtag dictionary mapping each concatenated hashtag
    to its segmented words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the hashtags.
    hash_col : str
        Name of the column in `df` with the hashtags. A cell may hold a
        single hashtag string or a list of them, e.g. [#hash1, #hash2].

    Returns
    -------
    dict
        One entry per distinct hashtag: the key is the hashtag exactly as it
        appears in `hash_col`, the value is `hash_fix(key)`. Keys are ordered
        from the most to the least frequent hashtag. Empty when the column
        contains no hashtags.
    """
    # explode() flattens list cells into one hashtag per row (plain string
    # cells are left untouched) without building a Series for every row.
    # value_counts() then de-duplicates, drops missing values and sorts by
    # descending frequency; only its index (the distinct tags) is needed.
    distinct_tags = df[hash_col].explode().value_counts().index
    # Remove numbers and segment multiple words using hash_fix.
    return {tag: hash_fix(tag) for tag in distinct_tags}

def preprocess_tweet_hash(row):
    """
    Preprocesses the hashtag-expanded text of one tweet.

    "&amp;" (and a bare "&amp") is rewritten to "and", then the text is
    passed through tweet-preprocessor's `p.clean` with the URL, EMOJI,
    MENTION, RESERVED, SMILEY and NUMBER options selected. The HASHTAG
    option is not selected here.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key 'text_hash_split'.

    Returns
    -------
    str or None
        The cleaned text, or None when the 'text_hash_split' value is not
        a string (e.g. NaN or None).
    """
    try:
        # The HTML entity for '&' is "&amp;" - replace it including the
        # semicolon so no stray ';' is left behind; the bare "&amp" form is
        # still handled afterwards.
        text = row['text_hash_split'].replace("&amp;", "and").replace("&amp", "and")
    except AttributeError:
        # The value has no .replace(), i.e. it is not a string.
        return None

    # The options are module-wide state in tweet-preprocessor, and
    # preprocess_tweet_no_hash selects a different set, so they are set on
    # every call.
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION,
                  p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.NUMBER)
    return p.clean(text)

def preprocess_tweet_no_hash(row):
    """
    Preprocesses the raw text of one tweet, with the HASHTAG option enabled.

    "&amp;" (and a bare "&amp") is rewritten to "and", then the text is
    passed through tweet-preprocessor's `p.clean` with the URL, EMOJI,
    MENTION, RESERVED, SMILEY, NUMBER and HASHTAG options selected.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key 'text'.

    Returns
    -------
    str or None
        The cleaned text, or None when the 'text' value is not a string
        (e.g. NaN or None) - the same convention as preprocess_tweet_hash.
    """
    try:
        # The HTML entity for '&' is "&amp;" - replace it including the
        # semicolon so no stray ';' is left behind; the bare "&amp" form is
        # still handled afterwards.
        text = row['text'].replace("&amp;", "and").replace("&amp", "and")
    except AttributeError:
        # The value has no .replace(), i.e. it is not a string.
        return None

    # The options are module-wide state in tweet-preprocessor, and
    # preprocess_tweet_hash selects a different set, so they are set on
    # every call.
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION,
                  p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.NUMBER, p.OPT.HASHTAG)
    return p.clean(text)

def clean_city(c, df, text_column):
    """
    Cleans the tweets of one city (or of the whole frame).

    Parameters
    ----------
    c : object or None
        Value to match against the "city" column of `df`. When None, every
        row of `df` is cleaned and the "city" column is not consulted.
    df : pandas.DataFrame
        Tweets to clean. It is not modified; the work is done on a copy.
    text_column : str
        Name of the column holding the raw tweet text.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with three extra columns:
        - "hashtag": list of hashtags (without '#') found in the text,
        - "text_hash_split": the text with every hashtag replaced by its
          segmented words (see hash_dict / hash_fix),
        - "text_with_hash": "text_hash_split" after preprocess_tweet_hash,
          lower-cased, with punctuation turned into spaces and runs of
          whitespace collapsed. Missing for rows whose text is not a string.
    """
    print("beginning city: ", c)
    # Work on an explicit copy: adding columns to a slice triggers
    # SettingWithCopyWarning, and adding them to `df` itself would silently
    # change the caller's frame.
    if c is not None:
        tweets_analysis = df.loc[df["city"] == c].copy()
    else:
        tweets_analysis = df.copy()

    tweets_analysis["hashtag"] = tweets_analysis[text_column].str.findall(r"#(\w+)")
    # Rows with missing text hold NaN instead of a list and are skipped.
    hashtags = [
        "#" + tag
        for tags in tweets_analysis["hashtag"]
        if isinstance(tags, list)
        for tag in tags
    ]

    print("replacing hashtags")
    # Without any hashtag there is nothing to segment; skip hash_dict so a
    # hashtag-free selection is cleaned instead of raising.
    if hashtags:
        tag_dict = hash_dict(pd.DataFrame({"hashtags": hashtags}), "hashtags")
    else:
        tag_dict = {}
    print("made hash dictionary")

    print("beginning replacement process")
    # One pass over the text: `#\w+` is greedy, so each hashtag is matched as
    # a whole and looked up in tag_dict. Replacing the dictionary keys one
    # after another as regexes would let a short tag (e.g. "#climate")
    # rewrite the start of a longer one ("#climatechange") before the longer
    # key is tried.
    tweets_analysis["text_hash_split"] = tweets_analysis[text_column].str.replace(
        r"#\w+",
        lambda match: tag_dict.get(match.group(0), match.group(0)),
        regex=True,
    )
    print("done with hashtags")

    # preprocess_tweet_hash only reads row['text_hash_split'], so it is fed a
    # one-key mapping per tweet; unlike DataFrame.apply(axis=1) this also
    # yields a proper (empty) Series when no rows were selected.
    cleaned = tweets_analysis["text_hash_split"].map(
        lambda text: preprocess_tweet_hash({"text_hash_split": text})
    )
    # regex=True is required: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would leave the text
    # unchanged.
    tweets_analysis["text_with_hash"] = (
        cleaned.str.lower()
        .str.replace(r"[^\w\s]", " ", regex=True)
        .str.replace(r"\s\s+", " ", regex=True)
    )
    return tweets_analysis
    

def clean_cities(input_filename, output_filename):
    """
    Cleans the tweets of every city in a CSV file and saves the result.

    The file is read with pandas, `clean_city` is run once per distinct
    value of the "city" column (using the "text" column as tweet text), the
    per-city results are concatenated, "text_with_hash" is renamed to
    "text_clean", and the combined frame is written to `output_filename`.

    Parameters
    ----------
    input_filename : str or path-like
        CSV file to read; must contain "city" and "text" columns.
    output_filename : str or path-like
        CSV file to write (the index is written too).

    Returns
    -------
    pandas.DataFrame
        The combined, cleaned tweets.

    Raises
    ------
    ValueError
        From pandas.concat when not a single city could be cleaned.
    """
    df = pd.read_csv(input_filename)
    final_array = []
    # Rows without a city can never match clean_city's equality filter, so
    # missing values are dropped up front.
    for c in df["city"].dropna().unique():
        try:
            # clean_city selects the rows of `c` itself, so the full frame is
            # handed over as is instead of being copied once per city.
            new_array = clean_city(c, df, "text")
        except KeyError as err:
            # Best effort: a city that cannot be cleaned is left out of the
            # result, but say so rather than dropping it silently.
            print("skipping city: ", c, "- KeyError:", err)
            continue
        final_array.append(new_array)
    final_cities = pd.concat(final_array).rename(
        columns={"text_with_hash": "text_clean"}
    )
    final_cities.to_csv(output_filename)
    return final_cities


In [None]:
#hashtags = cleaning("city_tweets_large.csv", "text", "city_tweets_hashtags_clean.csv")

In [None]:
# Clean every city in the input CSV; the cleaned tweets are written to the
# second file and also kept in `final_cities`.
final_cities = clean_cities(
    "city_tweets_hashtags2.csv",
    "city_tweets_hashtags2_clean.csv",
)

In [None]:
# Cleaning the tweets in `new_tweets` in fixed-size chunks, without filtering
# by city (clean_city is called with c=None).
# NOTE(review): the original header comment was cut off ("tweets that are
# not ...") - presumably tweets that are not tied to a city; confirm.
# NOTE(review): `new_tweets` is not defined in the cells visible here; it must
# be created by an earlier cell before this one is run.
CHUNK_SIZE = 2000
N_CHUNKS = 30

final_array = []
for i in range(N_CHUNKS):
    beginning_index = i * CHUNK_SIZE
    end_index = beginning_index + CHUNK_SIZE
    print("beginning at", beginning_index)
    print("ending at", end_index)
    # .loc label slices include BOTH endpoints, so slicing up to `end_index`
    # would put rows 2000, 4000, ... into two consecutive chunks and duplicate
    # them in the result. Stop one label earlier instead.
    # Copy only the chunk (not the whole frame on every iteration) so that
    # clean_city can add its columns without touching `new_tweets`.
    chunk = new_tweets.loc[beginning_index:end_index - 1, :].copy()
    if chunk.empty:
        # Fewer rows than N_CHUNKS * CHUNK_SIZE: nothing to clean here.
        continue
    new_array = clean_city(None, chunk, "text")
    final_array.append(new_array)
final_tweets = pd.concat(final_array).rename(columns={"text_with_hash": "text_clean"})