In [1]:
# Start with loading all necessary libraries

import pandas as pd
import numpy as np
import re
import nltk
import sklearn
import os

In [None]:
# Load the collected tweets from a CSV file into a dataframe.
# `dataFile` is a placeholder: set it to the path of the CSV export before running.
dataFile = ''

tweets_5Gandauction = dataFile

# Comma-separated input, parsed with pandas' pure-Python engine
tweet_5Gandauction_data = pd.read_csv(
    tweets_5Gandauction,
    sep=",",
    engine='python',
)

In [None]:
# Drop the unnecessary columns. In this example I am only interested in the text
# and need to protect the user information.

tweet_5Gandauction_data = tweet_5Gandauction_data.drop(
    columns=[
        'from_user',
        'id_str',
        'geo_coordinates',
        'created_at',
        'time',
        'user_lang',
        'in_reply_to_user_id_str',
        'in_reply_to_screen_name',
        'from_user_id_str',
        'in_reply_to_status_id_str',
        'source',
        'profile_image_url',
        'user_followers_count',
        'user_friends_count',
        'status_url',
        'entities_str',
    ]
)
tweet_5Gandauction_data.head(10)

In [None]:
# Display the first rows of the text column only
tweet_5Gandauction_data['text'].head()

In [None]:
# Shape of the dataframe as (rows, columns); the row count is the number of tweets in my data file
tweet_5Gandauction_data.shape

In [None]:
# NLTK is a platform for building Python programs to work with human language data. 
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# A token is a piece of a whole: a word is a token in a sentence, and a sentence is a token in a paragraph.
# Tokenization is the process of splitting a string into a list of tokens.

tweet_tokenizer = TweetTokenizer()

# Every tweet becomes a list of tokens; the bound method is handed to apply() directly
word_tokens = tweet_5Gandauction_data['text'].apply(tweet_tokenizer.tokenize)
word_tokens.head()

In [None]:
def lower_case(x):
    """Return a new list with the lower-cased form of every token in ``x``."""
    lowered = []
    for token in x:
        lowered.append(token.lower())
    return lowered
    
# Lower-case the tokens of every tweet
lower_tokens = word_tokens.apply(lower_case)
lower_tokens.head(10)

In [None]:
# Removes usernames from the text column
def remove_username(x):
    """Strip ``@handle`` mentions out of every token in ``x``.

    A handle is ``@`` followed by letters, digits or underscores (matched
    case-insensitively). The matched part is replaced by an empty string, so a
    token that consisted only of a handle becomes ``""``; the returned list has
    the same length as the input.
    """
    cleaned = []
    for token in x:
        cleaned.append(re.sub(r'(?i)@[a-z0-9_]+', "", token, flags=re.MULTILINE))
    return cleaned

no_username = lower_tokens.apply(lambda x : remove_username(x))
no_username.head(10)

In [None]:
# Removes retweet ("RT") tags inside the text column
def remove_tags(x):
    """Blank out retweet markers in a list of tokens.

    The previous pattern, ``RT^@[a-z0-9_]+``, had the start-of-line anchor ``^``
    between ``RT`` and ``@``. A line can never start right after the literal
    ``RT`` inside a token, so the pattern never matched and no tag was removed.

    A token is now replaced by ``""`` when, ignoring case, the whole token is
    ``RT``, optionally followed by an ``@handle`` and/or a trailing colon
    (``RT``, ``rt``, ``RT@user``, ``rt @user:``). Words that merely contain the
    letters (``art``, ``smart``) are left untouched.

    Parameters
    ----------
    x : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        Same length as ``x``; retweet-marker tokens are replaced by ``""``.
    """
    return [re.sub(r'(?i)^RT(?:\s*@[a-z0-9_]+)?:?$', "", words) for words in x]

# Run the retweet-tag removal over the tokens of every tweet
no_retweet_tags = no_username.apply(remove_tags)
no_retweet_tags.head(10)

In [None]:
# Remove url inside the text column
def remove_url(x):
    """Delete URL-like substrings from every token in ``x``.

    The pattern accepts an optional ``http://`` / ``https://`` prefix, a host
    made of lower-case letters, digits, dots and dashes, a dotted suffix and an
    optional path. It is case-sensitive. Only the matched part of a token is
    removed, so a token that is entirely a URL becomes ``""``; the returned
    list has the same length as the input.
    """
    url_regex = re.compile(
        r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?\S',
        flags=re.MULTILINE,
    )
    stripped = []
    for token in x:
        stripped.append(url_regex.sub("", token))
    return stripped

# Strip URLs from the tokens of every tweet
no_urls = no_retweet_tags.apply(remove_url)
no_urls.head(10)

In [None]:
# One shared lemmatizer, reused by every call to get_lemma
lemmatizer = WordNetLemmatizer()

def get_lemma(x):
    """Lemmatize every token in ``x`` with the shared WordNet lemmatizer.

    The function used to build a fresh ``WordNetLemmatizer`` on each call,
    shadowing the module-level instance defined just above and leaving that
    instance unused. It now reuses the module-level one.

    Parameters
    ----------
    x : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The lemma of each token, in the original order.
    """
    return [lemmatizer.lemmatize(words) for words in x]

# Lemmatize the tokens of every tweet
lemmatized_tokens = no_urls.apply(get_lemma)
lemmatized_tokens.head(10)

In [None]:
# Import the stopwords list in english. A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.
from nltk.corpus import stopwords
# Copy NLTK's English stop-word list into a plain Python list and display it
stop_words = list(stopwords.words('english'))
stop_words

In [None]:
def remove_stopwords(x, stop_words):
    """Return the tokens of ``x`` that are not stop words.

    Parameters
    ----------
    x : iterable of str
        Tokens of one tweet.
    stop_words : iterable of str
        Words to discard. Any iterable is accepted; it is converted to a set
        once per call so each membership test is a hash lookup instead of a
        scan through a list.

    Returns
    -------
    list of str
        The tokens that are not in ``stop_words``, in the original order.
    """
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    return [words for words in x if words not in stop_words]

    
# Filter the stop words out of every tweet; the word list is forwarded through apply()'s `args`
stopwords_removed = lemmatized_tokens.apply(remove_stopwords, args=(stop_words,))
stopwords_removed.head(10)


In [None]:
def remove_punctuations(x, keep_alphanumeric=False):
    """Keep only word-like tokens, dropping punctuation-only ones.

    Parameters
    ----------
    x : iterable of str
        Tokens of one tweet.
    keep_alphanumeric : bool, default False
        With the default, a token is kept only if ``str.isalpha()`` is true.
        Besides punctuation, that also discards every token containing a digit
        or symbol, e.g. ``'5g'``, ``'2020'``, ``'#tag'`` or ``"don't"``.
        Pass ``True`` to test with ``str.isalnum()`` instead, which keeps
        tokens such as ``'5g'`` and ``'2020'`` while still dropping
        punctuation.

    Returns
    -------
    list of str
        The tokens that pass the test, in the original order.
    """
    if keep_alphanumeric:
        return [words for words in x if words.isalnum()]
    return [words for words in x if words.isalpha()]

# Drop the non-alphabetic tokens of every tweet
no_punct = stopwords_removed.apply(remove_punctuations)
no_punct.head(10)

In [None]:
from nltk.corpus import stopwords
# Shared resources for tweet_preprocessor, built once here rather than on every call
stop_words = list(stopwords.words('english'))

tweet_tokenizer = TweetTokenizer()

def tweet_preprocessor(text):
    """Turn one raw tweet into a comma-separated string of cleaned tokens.

    Steps, in order: tokenize, lower-case, remove retweet tags, remove
    usernames, remove URLs, remove stop words, lemmatize, and drop
    non-alphabetic tokens.

    The stop-word list and the tokenizer are the module-level objects defined
    just above. Previously both were rebuilt on every call, which re-read the
    stop-word corpus once per tweet, and an unused local lemmatizer was created.

    Parameters
    ----------
    text : str
        Raw tweet text. A non-string value (for example NaN from a missing CSV
        field) yields an empty string instead of raising inside the tokenizer.

    Returns
    -------
    str
        The surviving tokens joined with ``', '``; ``''`` when none survive.
    """
    if not isinstance(text, str):
        return ''

    clean_text = tweet_tokenizer.tokenize(text)
    clean_text = lower_case(clean_text)
    clean_text = remove_tags(clean_text)
    clean_text = remove_username(clean_text)
    clean_text = remove_url(clean_text)
    clean_text = remove_stopwords(clean_text, stop_words)
    clean_text = get_lemma(clean_text)
    clean_text = remove_punctuations(clean_text)

    return ', '.join(clean_text)

In [None]:
# Store the cleaned version of every tweet in a new column
tweet_5Gandauction_data['clean_text'] = tweet_5Gandauction_data['text'].apply(tweet_preprocessor)

In [None]:
# Inspect the first ten cleaned tweets
tweet_5Gandauction_data['clean_text'].head(10)

In [None]:
# clean_text is a ', '-joined string, so len() on it counts characters, not words.
# Count the joined tokens instead; an empty string means zero words.
tweet_5Gandauction_data['word_count'] = tweet_5Gandauction_data['clean_text'].apply(
    lambda x: len(x.split(', ')) if x else 0
)
# Keep only the tweets that still contain at least one token after cleaning
tweet_5Gandauction_data = tweet_5Gandauction_data[tweet_5Gandauction_data['word_count'] > 0]

In [None]:
# Inspect the cleaned tweets that remain after the word_count filter
tweet_5Gandauction_data['clean_text'].head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Display settings: allow long cell contents and show every column
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', None)

# TF-IDF over the raw text; each tweet is cleaned by tweet_preprocessor first.
# max_df / min_df bound the document frequency of the terms that are kept.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=tweet_preprocessor,
    max_df=0.75,
    min_df=3,
    use_idf=True,
)

text_tfidf = tfidf_vectorizer.fit_transform(tweet_5Gandauction_data['text'])

In [None]:
# Sum the TF-IDF weight of each term over all tweets
occ = np.asarray(text_tfidf.sum(axis=0)).ravel().tolist()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()

tfidf_scores = pd.DataFrame({'terms': tfidf_terms, 'tfidf': occ})

tfidf_scores.sort_values('tfidf', ascending=False)

In [None]:
# Count which words are the most popular and how many times they are mentioned in the collected tweets.

count_vectorizer = CountVectorizer(preprocessor=tweet_preprocessor)
count_matrix = count_vectorizer.fit_transform(tweet_5Gandauction_data['text'])

# Total number of occurrences of each term over all tweets
occ = np.asarray(count_matrix.sum(axis=0)).ravel().tolist()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    count_terms = count_vectorizer.get_feature_names_out()
else:
    count_terms = count_vectorizer.get_feature_names()

freq_counts = pd.DataFrame({'terms': count_terms, 'Freq': occ})

freq_counts.sort_values('Freq', ascending=False)

In [None]:
# Preview the dataframe, including the derived clean_text and word_count columns
tweet_5Gandauction_data.head(10)

In [None]:
# The raw text is no longer needed; keep only the derived columns
tweet_5Gandauction_data = tweet_5Gandauction_data.drop(columns=['text'])
tweet_5Gandauction_data.head(10)

In [None]:
# Shape as (rows, columns) before duplicated tweets are removed
tweet_5Gandauction_data.shape

In [None]:
# Remove duplicated tweets: rows with the same clean_text are collapsed, keeping the first occurrence

tweet_5Gandauction_data = tweet_5Gandauction_data.drop_duplicates(
    subset=['clean_text'],
    keep='first',
)
tweet_5Gandauction_data.head(10)

In [None]:
# Count how many tweets we have after removing duplicates (shape is (rows, columns))
tweet_5Gandauction_data.shape

In [None]:
# Count term frequency after removing duplicates

# clean_text already went through tweet_preprocessor, so no custom preprocessor is
# passed here: running it again would filter and lemmatize the cleaned tokens a
# second time. The default tokenizer splits the ', '-joined tokens on its own.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(tweet_5Gandauction_data['clean_text'])

# Total number of occurrences of each term over the de-duplicated tweets
occ = np.asarray(count_matrix.sum(axis=0)).ravel().tolist()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    count_terms = count_vectorizer.get_feature_names_out()
else:
    count_terms = count_vectorizer.get_feature_names()

freq_counts = pd.DataFrame({'terms': count_terms, 'Freq': occ})

freq_counts.sort_values('Freq', ascending=False)


In [None]:
# Import WordCloud and Matplotlib to create our wordcloud.
# Here we put all the words together, removing the "rt" marker left over from retweets.

from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt

# Named `wordcloud_stopwords` rather than `stopwords`: the old name overwrote the
# nltk.corpus `stopwords` module imported earlier, so re-running the cell that
# builds the English stop-word list after this one failed.
wordcloud_stopwords = set(STOPWORDS)

# Collect the tokens in a list and join once at the end, instead of growing a
# string with repeated `+=`.
kept_tokens = []
for row in tweet_5Gandauction_data['clean_text']:
    # clean_text is comma separated: strip the commas, then split on whitespace
    for token in str(row).replace(',', '').split():
        token = token.lower()
        if token != 'rt':
            kept_tokens.append(token)

# Same layout as before: every kept token is followed by a single space
comment_words = ''.join(token + " " for token in kept_tokens)

print(comment_words)

In [None]:
# Create the wordcloud from the collected words

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color="white",
    min_font_size=10,
    max_words=100,
).generate(comment_words)

# Draw it on a square figure without axes or padding
fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
fig.tight_layout(pad=0)

plt.show()

In [None]:
# Saves the wordcloud image to disk.
# `destinationFolder` is a placeholder and must be filled in before running.
# NOTE(review): the value is passed straight to WordCloud.to_file, which presumably
# needs a full image file path (e.g. ending in .png), not just a folder — confirm.
destinationFolder = ''

wordcloud.to_file(destinationFolder)