In [1]:
import pandas as pd
import string
import re
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import linear_kernel

# Load the tweet table from the CSV file in the current working directory.
df = pd.read_csv("tweets.csv")
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0.1,Unnamed: 0,date,id,link,retweet,text,author
0,0,Oct 7,784609194234306560,/realDonaldTrump/status/784609194234306560,False,Here is my statement.pic.twitter.com/WAZiGoQqMQ,DonaldTrump
1,1,Oct 10,785608815962099712,/realDonaldTrump/status/785608815962099712,False,Is this really America? Terrible!pic.twitter.c...,DonaldTrump
2,2,Oct 8,784840992734064640,/realDonaldTrump/status/784840992734064641,False,The media and establishment want me out of the...,DonaldTrump
3,3,Oct 8,784767399442653184,/realDonaldTrump/status/784767399442653184,False,Certainly has been an interesting 24 hours!,DonaldTrump
4,4,Oct 10,785561269571026944,/realDonaldTrump/status/785561269571026946,False,Debate polls look great - thank you!\n#MAGA #A...,DonaldTrump


In [2]:
# Number of rows in the loaded frame.
len(df)

17216

In [3]:
# Build the working frame: drop every listed metadata column from the raw
# frame so that only the remaining column(s) are carried forward.
df_clean = df.drop(
    ["Unnamed: 0", "date", "id", "link", "retweet", "author"],
    axis=1,
)
df_clean.head()

Unnamed: 0,text
0,Here is my statement.pic.twitter.com/WAZiGoQqMQ
1,Is this really America? Terrible!pic.twitter.c...
2,The media and establishment want me out of the...
3,Certainly has been an interesting 24 hours!
4,Debate polls look great - thank you!\n#MAGA #A...


In [4]:
# Count missing values per column of the working frame.
df_clean.isnull().sum()

text    0
dtype: int64

In [5]:
import preprocessor as p

def preprocess_tweet(row):
    """Return the ``'text'`` field of ``row`` after passing it through ``p.clean``.

    Written to be used with ``DataFrame.apply(..., axis=1)``, where ``row`` is
    one row of the frame; any mapping with a ``'text'`` key works.
    """
    return p.clean(row['text'])

# Run preprocess_tweet on every row and overwrite the 'text' column with the
# result. The column is replaced in place, so re-running this cell feeds the
# already-cleaned text through the cleaner again.
df_clean['text'] = df_clean.apply(preprocess_tweet, axis=1)

In [6]:
# Preview the working frame after the 'text' column was overwritten.
df_clean.head()

Unnamed: 0,text
0,Here is my
1,Is this really America? Terrible!
2,The media and establishment want me out of the...
3,Certainly has been an interesting hours!
4,Debate polls look great - thank you!


In [7]:
def lemmatize_text(text):
    """Tokenize ``text`` and return the list of stemmed tokens.

    Despite the name, this applies stemming (``stemmer.stem``), not
    lemmatization. It relies on the module-level ``w_tokenizer`` and
    ``stemmer`` objects, which must be defined before the first call.
    """
    stems = []
    for token in w_tokenizer.tokenize(text):
        stems.append(stemmer.stem(token))
    return stems

In [8]:
# Module-level helpers used by lemmatize_text.
stemmer = PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

# Build the 'Cleaned' column in one pass:
#   1. drop every character listed in string.punctuation,
#   2. tokenize and stem the remaining text,
#   3. glue the stems back into one space-separated string.
df_clean['Cleaned'] = (
    df_clean['text']
    .apply(lambda tweet: ''.join(ch for ch in tweet if ch not in string.punctuation))
    .apply(lemmatize_text)
    .apply(' '.join)
)

In [9]:
# Compare the 'text' column with the derived 'Cleaned' column.
df_clean.head()

Unnamed: 0,text,Cleaned
0,Here is my,here is my
1,Is this really America? Terrible!,Is thi realli america terribl
2,The media and establishment want me out of the...,the media and establish want me out of the rac...
3,Certainly has been an interesting hours!,certainli ha been an interest hour
4,Debate polls look great - thank you!,debat poll look great thank you


In [10]:
# Corpus to index: the 'Cleaned' column of the working frame.
X = df_clean.Cleaned

# Fit a TF-IDF vectorizer with default settings on the corpus and keep both
# the fitted vectorizer and the resulting matrix for the search cells below.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(X)

In [11]:
# Shape of the fitted TF-IDF matrix.
vectors.shape

(17216, 10732)

In [12]:
# Persist the fitted artefacts so the search cells can run without refitting.
with open('vectors.pickle', 'wb') as output:
    pickle.dump(vectors, output)

with open('vectorizer.pickle', 'wb') as output:
    pickle.dump(vectorizer, output)

with open('df_tweet.pickle', 'wb') as output:
    pickle.dump(df_clean, output)


# Reload the artefacts under the names the search cells actually read.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook (or another trusted source).
with open('vectors.pickle', 'rb') as data:
    # The matrix used to be loaded into `vector` only, while every later cell
    # reads `vectors`; the reloaded copy was therefore never used, and a
    # load-only run failed with a NameError.
    vectors = pickle.load(data)
vector = vectors  # kept so code that referenced the old name still works

with open('vectorizer.pickle', 'rb') as data:
    vectorizer = pickle.load(data)

with open('df_tweet.pickle', 'rb') as data:
    # This rebinds `df` from the raw CSV frame to the pickled working frame.
    df = pickle.load(data)

In [13]:
def preprocessing(text):
    """Normalise a free-text query the same way the corpus was normalised.

    Steps: ``p.clean`` -> delete every ``string.punctuation`` character ->
    tokenize and stem via ``lemmatize_text``.

    Parameters
    ----------
    text : str
        Raw query text.

    Returns
    -------
    list of str
        The stemmed tokens of the query, in order.
    """
    text = p.clean(text)
    # Delete punctuation per character. The previous token-level test
    # (`token not in string.punctuation`) was a substring check, so a word
    # with attached punctuation such as "Terrible!" kept its "!".
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Pass the cleaned string itself to the stemmer. Passing str(list) fed it
    # the Python list repr, with quotes, commas and brackets glued to tokens.
    return lemmatize_text(text)

In [14]:
def vectorizing(text,vectorizer):
    """Transform one query into a vector with an already fitted vectorizer.

    Parameters
    ----------
    text : str or list/tuple of tokens
        The query. A token sequence is joined with single spaces; any other
        non-string value falls back to ``str(text)``.
    vectorizer : object with a ``transform(list_of_documents)`` method
        The fitted vectorizer.

    Returns
    -------
    Whatever ``vectorizer.transform`` returns for a one-document list.
    """
    if isinstance(text, str):
        document = text
    elif isinstance(text, (list, tuple)):
        # str(list) would hand the vectorizer the Python repr ("['a', 'b']");
        # join the tokens so it sees plain text regardless of its tokenizer.
        document = " ".join(str(token) for token in text)
    else:
        document = str(text)
    text_vector = vectorizer.transform([document])
    return text_vector

In [15]:
def similar_tweets(text_vector, vectors,df, top_n=20):
    """Return the texts of the ``top_n`` corpus rows most similar to a query.

    Similarity is ``linear_kernel(text_vector, vectors)``, i.e. the dot product
    of the query vector with every row of ``vectors``.

    Parameters
    ----------
    text_vector : single-row matrix
        Vector of the query.
    vectors : matrix
        One row per document, in the same row order as ``df``.
    df : pandas.DataFrame
        Frame with a ``text`` column, positionally aligned with ``vectors``.
    top_n : int, default 20
        Number of rows to return (fewer if the corpus is smaller).

    Returns
    -------
    pandas.Series
        The ``text`` values of the best matches, most similar first, keeping
        their original index labels.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    cosine_similarities = linear_kernel(text_vector, vectors).flatten()
    # Highest scores first, truncated to top_n.
    related_docs_indices = cosine_similarities.argsort()[::-1][:top_n]
    # argsort yields row positions, so index positionally. The previous `.loc`
    # lookup treated them as labels and broke on any non-default index.
    tweets = df.text.iloc[related_docs_indices]
    return tweets

In [16]:
def series_toJson(series):
    """Serialise ``series`` with its own ``to_json`` method and return the JSON string."""
    json_text = series.to_json()
    return json_text

In [17]:
# End-to-end example: normalise a free-text query, vectorize it with the
# fitted vectorizer, and look up the most similar rows of `df`.
text = "I want a great America !"
text_preprocessed = preprocessing(text)
text_vector = vectorizing(text_preprocessed,vectorizer)
tweets = similar_tweets(text_vector, vectors,df)

In [18]:
# Display the Series returned by similar_tweets.
tweets

2216    ": MEDIA WANTS A PUPPET.WE WANT DONALD TRUMP T...
3272                                            I want to
3030                    Trump Will Make America GREAT!!!!
1328                            MAKE AMERICA GREAT AGAIN!
1198                            MAKE AMERICA GREAT AGAIN!
1395                        MAKE AMERICA GREAT AGAIN! ://
3588                            MAKE AMERICA GREAT AGAIN!
1241                            MAKE AMERICA GREAT AGAIN!
867                             MAKE AMERICA GREAT AGAIN!
130                             MAKE AMERICA GREAT AGAIN!
4364                       ": Make America Great Again! "
4368                            MAKE AMERICA GREAT AGAIN!
1020                            MAKE AMERICA GREAT AGAIN!
1086                            MAKE AMERICA GREAT AGAIN!
672                             MAKE AMERICA GREAT AGAIN!
582                         MAKE AMERICA GREAT AGAIN! ://
974                             MAKE AMERICA GREAT AGAIN!
1071          

In [19]:
# Same result serialised to a JSON string.
series_toJson(tweets)

'{"2216":"\\": MEDIA WANTS A PUPPET.WE WANT DONALD TRUMP TO MAKE AMERICA GREAT AGAIN, LAST CHANCE TO MAKE AMERICA GREAT","3272":"I want to","3030":"Trump Will Make America GREAT!!!!","1328":"MAKE AMERICA GREAT AGAIN!","1198":"MAKE AMERICA GREAT AGAIN!","1395":"MAKE AMERICA GREAT AGAIN! :\\/\\/","3588":"MAKE AMERICA GREAT AGAIN!","1241":"MAKE AMERICA GREAT AGAIN!","867":"MAKE AMERICA GREAT AGAIN!","130":"MAKE AMERICA GREAT AGAIN!","4364":"\\": Make America Great Again! \\"","4368":"MAKE AMERICA GREAT AGAIN!","1020":"MAKE AMERICA GREAT AGAIN!","1086":"MAKE AMERICA GREAT AGAIN!","672":"MAKE AMERICA GREAT AGAIN!","582":"MAKE AMERICA GREAT AGAIN! :\\/\\/","974":"MAKE AMERICA GREAT AGAIN!","1071":"MAKE AMERICA GREAT AGAIN!","990":"MAKE AMERICA GREAT AGAIN!","3576":"Great!"}'