In [1]:
#Importing Libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from  sklearn.metrics  import accuracy_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
#Reading data
data=pd.read_csv("Downloads\processed_tweets_emojis.csv")

In [3]:
data.shape

(30338, 3)

In [4]:
data.head()

Unnamed: 0,text,emoji,target
0,"Zach <UNK> , just <UNK> into the first !",👀,👀
1,"<UNK> want yall sleep , shit does add up in <U...",😂,😂
2,remember to,👀💭,💭
3,I hope in real life there are no <UNK> clouds ...,😮😮😮,😮
4,<UNK> <UNK> ASU ! ! ! GIVE <UNK> <UNK>,😈😈😈,😈


In [5]:
#Data Cleaning
def cleaning(df):
    stop = stopwords.words('english')
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word!= "<UNK>"])) #removing <UNK> from text
    df['text'] = df['text'].str.replace('\W', ' ') #removig special characters
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #removing stopwords
    df['text'] = df['text'].str.replace('\d+', '') #removing digets
    df['text']= df['text'].str.replace(r'[^\x00-\x7F]+', '') #removing chinese characters
    return df

In [6]:
cleaned_data=cleaning(data)

In [7]:
cleaned_data.head()

Unnamed: 0,text,emoji,target
0,Zach first,👀,👀
1,want yall sleep shit add boy,😂,😂
2,remember,👀💭,💭
3,I hope real life clouds make,😮😮😮,😮
4,ASU GIVE,😈😈😈,😈


In [9]:
def selecting_top_50(df):
    #dropping all emojis with less than 50 ocurrence
    occurences=df["target"].value_counts().to_dict()
    l=[]
    for i in occurences.keys():
        if occurences[i] >=50:
            l.append(i)
    df = df[df.target.isin(l)]
    return df

In [10]:
top50=selecting_top_50(data)

In [11]:
top50.head()

Unnamed: 0,text,emoji,target
0,Zach first,👀,👀
1,want yall sleep shit add boy,😂,😂
4,ASU GIVE,😈😈😈,😈
5,Why fine tho,👀👀,👀
6,This hilarious,😂,😂


In [12]:
def stemming(df):
    #stemming the text 
    stemmer = SnowballStemmer("english")
    df['stemmed'] = df["text"].apply(lambda x: [stemmer.stem(y) for y in x.split()])
    df['stemmed']=[" ".join(i) for i in df['stemmed'].values]
    df=df[df.stemmed != ""]
    df.drop_duplicates(inplace=True) #dropping duplicates
    lb_make = LabelEncoder() #label encoding target variable
    df["main_target"] = lb_make.fit_transform(df["target"])
    return df

In [13]:
stemmed_data=stemming(top50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-

In [14]:
stemmed_data.head()

Unnamed: 0,text,emoji,target,stemmed,main_target
0,Zach first,👀,👀,zach first,29
1,want yall sleep shit add boy,😂,😂,want yall sleep shit add boy,65
4,ASU GIVE,😈😈😈,😈,asu give,70
5,Why fine tho,👀👀,👀,whi fine tho,29
6,This hilarious,😂,😂,this hilari,65


In [15]:
def test_train_split(df):
    #splitting the data
    X=df["stemmed"]
    y=df["main_target"]
    X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.10, random_state=42)
    return X_train, X_test, y_train, y_test

In [16]:
X_train, X_test, y_train, y_test = test_train_split(stemmed_data)

In [17]:
def tfidf(X_train, X_test):
    # vectorization before prediction
    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(X_train)
    test_vectors = vectorizer.transform(X_test)
    return train_vectors,test_vectors

In [18]:
vec_train,vec_test=tfidf(X_train, X_test)

In [19]:
# Prediction using Naive Bayes
clf = MultinomialNB().fit(vec_train, y_train)
predicted = clf.predict(vec_test)
print(accuracy_score(y_test,predicted))

0.164866401364


In [20]:
# Prediction using RandomForest Classifier

clf = RandomForestClassifier(n_estimators=10,
                              random_state=0)
clf.fit(vec_train, y_train)
predicted = clf.predict(vec_test)
print(accuracy_score(y_test,predicted))

0.135304150085


In [21]:
# Prediction using OneVsOneClassifier
predicted=OneVsOneClassifier(LinearSVC(random_state=0)).fit(vec_train, y_train).predict(vec_test)
print(accuracy_score(y_test,predicted))

0.170551449687
