In [66]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from  sklearn.metrics  import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
data=pd.read_csv("Downloads\processed_tweets_emojis.csv")

In [3]:
data.shape

(30338, 3)

In [4]:
data.head()

Unnamed: 0,text,emoji,target
0,"Zach <UNK> , just <UNK> into the first !",👀,👀
1,"<UNK> want yall sleep , shit does add up in <U...",😂,😂
2,remember to,👀💭,💭
3,I hope in real life there are no <UNK> clouds ...,😮😮😮,😮
4,<UNK> <UNK> ASU ! ! ! GIVE <UNK> <UNK>,😈😈😈,😈


In [6]:
def cleaning(df):
    stop = stopwords.words('english')
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word!= "<UNK>"]))
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    df['text'] = df['text'].str.replace('\d+', '')
    df['text']= df['text'].str.replace(r'[^\x00-\x7F]+', '')
    return df

In [7]:
cleaned_data=cleaning(data)

In [8]:
cleaned_data.head()

Unnamed: 0,text,emoji,target
0,"Zach , first !",👀,👀
1,"want yall sleep , shit add boy",😂,😂
2,remember,👀💭,💭
3,I hope real life clouds make,😮😮😮,😮
4,ASU ! ! ! GIVE,😈😈😈,😈


In [14]:
def selecting_top_50(df):
    #dropping all emojis with less than 5 occurrence
    occurences=df["target"].value_counts().to_dict()
    l=[]
    for i in occurences.keys():
        if occurences[i] > 5:
            l.append(i)
    df = df[df.target.isin(l)]
    return df

In [15]:
top50=selecting_top_50(data)

In [19]:
top50.head()

Unnamed: 0,text,emoji,target
0,"Zach , first !",👀,👀
1,"want yall sleep , shit add boy",😂,😂
2,remember,👀💭,💭
3,I hope real life clouds make,😮😮😮,😮
4,ASU ! ! ! GIVE,😈😈😈,😈


In [31]:
def stemming(df):
    stemmer = SnowballStemmer("english")
    df['stemmed'] = df["text"].apply(lambda x: [stemmer.stem(y) for y in x.split()])
    df['stemmed']=[" ".join(i) for i in df['stemmed'].values]
    df=df[df.stemmed != ""]
    df.drop_duplicates(inplace=True)
    lb_make = LabelEncoder()
    df["main_target"] = lb_make.fit_transform(df["target"])
    return df

In [32]:
stemmed_data=stemming(top50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-

In [33]:
stemmed_data.head()

Unnamed: 0,text,emoji,target,stemmed,main_target
0,"Zach , first !",👀,👀,"zach , first !",150
1,"want yall sleep , shit add boy",😂,😂,"want yall sleep , shit add boy",263
2,remember,👀💭,💭,rememb,211
3,I hope real life clouds make,😮😮😮,😮,i hope real life cloud make,306
4,ASU ! ! ! GIVE,😈😈😈,😈,asu ! ! ! give,269


In [61]:
def test_train_split(df):
    X=df["stemmed"]
    y=df["main_target"]
    X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.10, random_state=42)
    return X_train, X_test, y_train, y_test

In [62]:
X_train, X_test, y_train, y_test = test_train_split(stemmed_data)

In [63]:
def tfidf(X_train, X_test):
    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(X_train)
    test_vectors = vectorizer.transform(X_test)
    return train_vectors,test_vectors

In [64]:
vec_train,vec_test=tfidf(X_train, X_test)

In [65]:
clf = MultinomialNB().fit(vec_train, y_train)
predicted = clf.predict(vec_test)
print(accuracy_score(y_test,predicted))

0.136022514071


In [68]:
clf = RandomForestClassifier(n_estimators=10,
                              random_state=0)
clf.fit(vec_train, y_train)
predicted = clf.predict(vec_test)
print(accuracy_score(y_test,predicted))

0.103189493433


In [69]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
predicted=OneVsOneClassifier(LinearSVC(random_state=0)).fit(vec_train, y_train).predict(vec_test)
print(accuracy_score(y_test,predicted))

0.142589118199
